Reference for ultralytics/models/sam/sam3/tokenizer_ve.py
Improvements
This page is sourced from https://github.com/ultralytics/ultralytics/blob/main/ultralytics/models/sam/sam3/tokenizer_ve.py. Have an improvement or example to add? Open a Pull Request — thank you! 🙏
Summary
class ultralytics.models.sam.sam3.tokenizer_ve.SimpleTokenizer
def __init__(
    self,
    bpe_path: str | os.PathLike,
    additional_special_tokens: list[str] | None = None,
    context_length: int = 77,
    clean: str = "lower",
)
A simple tokenizer for text inputs.
Args
| Name | Type | Description | Default |
|---|---|---|---|
| bpe_path | str \| os.PathLike | Path to the gzip-compressed BPE merges/vocabulary file. | required |
| additional_special_tokens | list[str] \| None | Extra special tokens appended after <start_of_text> and <end_of_text>. | None |
| context_length | int | Default context (sequence) length used when tokenizing. | 77 |
| clean | str | Name of the text-cleaning function: "canonicalize", "lower", or "whitespace". | "lower" |
Methods
| Name | Description |
|---|---|
| __call__ | Return the tokenized representation of the given input string(s). |
| bpe | Byte Pair Encoding. |
| decode | Decodes a sequence of tokens back into a text string. |
| encode | Encode text to a sequence of BPE tokens. |
Source code in ultralytics/models/sam/sam3/tokenizer_ve.py
class SimpleTokenizer:
    """A simple tokenizer for text inputs."""

    def __init__(
        self,
        bpe_path: str | os.PathLike,
        additional_special_tokens: list[str] | None = None,
        context_length: int = 77,
        clean: str = "lower",
    ):
        """The tokenizer for text inputs."""
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        with g_pathmgr.open(bpe_path, "rb") as fh:
            bpe_bytes = io.BytesIO(fh.read())
            merges = gzip.open(bpe_bytes).read().decode("utf-8").split("\n")
        # merges = gzip.open(bpe_path).read().decode("utf-8").split("\n")
        merges = merges[1 : 49152 - 256 - 2 + 1]
        merges = [tuple(merge.split()) for merge in merges]
        vocab = list(bytes_to_unicode().values())
        vocab = vocab + [v + "</w>" for v in vocab]
        for merge in merges:
            vocab.append("".join(merge))
        special_tokens = ["<start_of_text>", "<end_of_text>"]
        if additional_special_tokens:
            special_tokens += additional_special_tokens
        vocab.extend(special_tokens)
        self.encoder = dict(zip(vocab, range(len(vocab))))
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {t: t for t in special_tokens}
        special = "|".join(special_tokens)
        self.pat = re.compile(
            special + r"""|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""",
            re.IGNORECASE,
        )
        self.vocab_size = len(self.encoder)
        self.all_special_ids = [self.encoder[t] for t in special_tokens]
        self.sot_token_id = self.all_special_ids[0]
        self.eot_token_id = self.all_special_ids[1]
        self.context_length = context_length
        self.clean_fn = get_clean_fn(clean)
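A minimal construction sketch is shown below; the bpe_simple_vocab_16e6.txt.gz path is an assumption and must point to a CLIP-style gzip-compressed BPE merges file on disk.

```python
from ultralytics.models.sam.sam3.tokenizer_ve import SimpleTokenizer

# Hypothetical path: replace with your local CLIP-style BPE merges file (gzip).
tokenizer = SimpleTokenizer(bpe_path="bpe_simple_vocab_16e6.txt.gz")

print(tokenizer.vocab_size)    # expected 49408 with the standard merges file and no extra special tokens
print(tokenizer.sot_token_id)  # id of "<start_of_text>"
print(tokenizer.eot_token_id)  # id of "<end_of_text>"
```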
method ultralytics.models.sam.sam3.tokenizer_ve.SimpleTokenizer.__call__
def __call__(self, texts: str | list[str], context_length: int | None = None) -> torch.LongTensor
Return the tokenized representation of the given input string(s). texts is an input string or a list of input strings to tokenize; context_length is the context length to use (all CLIP models use 77 as the context length).
Args
| Name | Type | Description | Default |
|---|---|---|---|
| texts | str \| list[str] | An input string or a list of input strings to tokenize. | required |
| context_length | int \| None | The context length to use; all CLIP models use 77. Defaults to the tokenizer's context_length. | None |
Returns
| Type | Description |
|---|---|
| torch.LongTensor | A two-dimensional tensor containing the resulting tokens, with shape [number of input strings, context_length]. |
Source code in ultralytics/models/sam/sam3/tokenizer_ve.py
def __call__(self, texts: str | list[str], context_length: int | None = None) -> torch.LongTensor:
    """Returns the tokenized representation of given input string(s).

    Parameters
    ----------
    texts : Union[str, list[str]]
        An input string or a list of input strings to tokenize
    context_length : int
        The context length to use; all CLIP models use 77 as the context length.

    Returns
    -------
    A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length]
    """
    if isinstance(texts, str):
        texts = [texts]
    context_length = context_length or self.context_length
    assert context_length, "Please set a valid context length"
    all_tokens = [[self.sot_token_id, *self.encode(text), self.eot_token_id] for text in texts]
    result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)
    for i, tokens in enumerate(all_tokens):
        if len(tokens) > context_length:
            tokens = tokens[:context_length]  # Truncate
            tokens[-1] = self.eot_token_id
        result[i, : len(tokens)] = torch.tensor(tokens)
    return result
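A short usage sketch, assuming a tokenizer built as in the constructor example above:

```python
# Tokenize a batch of prompts; output is a LongTensor padded/truncated to context_length.
tokens = tokenizer(["a photo of a cat", "a photo of a dog"])

print(tokens.shape)  # torch.Size([2, 77]) with the default context_length of 77
print(tokens.dtype)  # torch.int64
print(tokens[0, 0].item() == tokenizer.sot_token_id)  # True: every row starts with <start_of_text>
```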
method ultralytics.models.sam.sam3.tokenizer_ve.SimpleTokenizer.bpe
def bpe(self, token)
Byte Pair Encoding.
Args
| Name | Type | Description | Default |
|---|---|---|---|
| token | str | Word to encode, given as a string of byte-encoded characters. | required |
Source code in ultralytics/models/sam/sam3/tokenizer_ve.py
def bpe(self, token):
    """Byte Pair Encoding."""
    if token in self.cache:
        return self.cache[token]
    word = (*tuple(token[:-1]), token[-1] + "</w>")
    pairs = get_pairs(word)
    if not pairs:
        return token + "</w>"
    while True:
        bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
        if bigram not in self.bpe_ranks:
            break
        first, second = bigram
        new_word = []
        i = 0
        while i < len(word):
            try:
                j = word.index(first, i)
                new_word.extend(word[i:j])
                i = j
            except Exception:
                new_word.extend(word[i:])
                break
            if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                new_word.append(first + second)
                i += 2
            else:
                new_word.append(word[i])
                i += 1
        new_word = tuple(new_word)
        word = new_word
        if len(word) == 1:
            break
        else:
            pairs = get_pairs(word)
    word = " ".join(word)
    self.cache[token] = word
    return word
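A hedged sketch of calling bpe directly (using the tokenizer from the constructor example above); the exact sub-word split depends on the merges file that was loaded, so only the general shape of the output is indicated.

```python
# bpe() expects a byte-encoded token (plain ASCII maps to itself) and returns
# space-separated BPE sub-words, with "</w>" marking the end of the word.
print(tokenizer.bpe("cat"))        # e.g. "cat</w>" for short common words
print(tokenizer.bpe("tokenizer"))  # e.g. "token izer</w>" (actual split depends on the merges)
```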
method ultralytics.models.sam.sam3.tokenizer_ve.SimpleTokenizer.decode
def decode(self, tokens)
Decodes a sequence of tokens back into a text string.
Args
| Name | Type | Description | Default |
|---|---|---|---|
| tokens | | Sequence of integer token ids to decode. | required |
Source code in ultralytics/models/sam/sam3/tokenizer_ve.py
def decode(self, tokens):
    """Decodes a sequence of tokens back into a text string."""
    text = "".join([self.decoder[token] for token in tokens])
    text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors="replace").replace("</w>", " ")
    return text
method ultralytics.models.sam.sam3.tokenizer_ve.SimpleTokenizer.encode
def encode(self, text)
Encode text to a sequence of BPE tokens.
Args
| Name | Type | Description | Default |
|---|---|---|---|
| text | | Input text string to encode. | required |
Source code in ultralytics/models/sam/sam3/tokenizer_ve.py
def encode(self, text):
    """Encode text to a sequence of BPE tokens."""
    bpe_tokens = []
    text = self.clean_fn(text)
    for token in re.findall(self.pat, text):
        token = "".join(self.byte_encoder[b] for b in token.encode("utf-8"))
        bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(" "))
    return bpe_tokens
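A round-trip sketch with encode and decode, again assuming the tokenizer from the constructor example:

```python
ids = tokenizer.encode("a photo of a cat")  # list of integer BPE ids; no special tokens are added by encode
text = tokenizer.decode(ids)                # "a photo of a cat " (decode maps "</w>" back to spaces)

print(ids)
print(text)
```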
function ultralytics.models.sam.sam3.tokenizer_ve.bytes_to_unicode
def bytes_to_unicode()
Returns a mapping between utf-8 bytes and corresponding unicode strings. The reversible BPE codes work on unicode strings, which means the vocabulary needs a large number of unicode characters to avoid UNKs. At around a 10B-token dataset you end up needing roughly 5K characters for decent coverage, a significant fraction of a typical 32K BPE vocabulary. To avoid that, this function provides lookup tables between utf-8 bytes and unicode strings, while avoiding whitespace/control characters that the BPE code cannot handle.
Source code in ultralytics/models/sam/sam3/tokenizer_ve.py
@lru_cache
def bytes_to_unicode():
    """Returns list of utf-8 byte and a corresponding list of unicode strings. The reversible bpe codes work on
    unicode strings. This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. This is a
    significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup tables between utf-8
    bytes and unicode strings. And avoids mapping to whitespace/control characters the bpe code barfs on.
    """
    bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))
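A quick property check of the byte-to-unicode table: it covers all 256 byte values and is fully reversible.

```python
from ultralytics.models.sam.sam3.tokenizer_ve import bytes_to_unicode

b2u = bytes_to_unicode()
u2b = {v: k for k, v in b2u.items()}

assert len(b2u) == 256 and len(u2b) == 256         # bijective over all byte values
assert all(u2b[b2u[b]] == b for b in range(256))   # reversible
assert all(not c.isspace() for c in b2u.values())  # no whitespace/control characters in the outputs
```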
function ultralytics.models.sam.sam3.tokenizer_ve.get_pairs
def get_pairs(word)
Return set of symbol pairs in a word. Word is represented as tuple of symbols (symbols being variable-length
strings).
Args
| Name | Type | Description | Default |
|---|---|---|---|
| word | | Word represented as a tuple of symbols (variable-length strings). | required |
Source code in ultralytics/models/sam/sam3/tokenizer_ve.py
def get_pairs(word):
    """Return set of symbol pairs in a word. Word is represented as tuple of symbols (symbols being variable-length
    strings).
    """
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs
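For example, for the word tuple ("h", "e", "l", "l", "o</w>"):

```python
from ultralytics.models.sam.sam3.tokenizer_ve import get_pairs

print(get_pairs(("h", "e", "l", "l", "o</w>")))
# {('h', 'e'), ('e', 'l'), ('l', 'l'), ('l', 'o</w>')}  (set order may vary)
```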
function ultralytics.models.sam.sam3.tokenizer_ve.basic_clean
def basic_clean(text)
Basic text cleaning: fix unicode and unescape HTML entities.
Args
| Name | Type | Description | Default |
|---|---|---|---|
| text | | Input text string. | required |
Source code in ultralytics/models/sam/sam3/tokenizer_ve.py
def basic_clean(text):
    """Basic text cleaning: fix unicode and unescape HTML entities."""
    text = ftfy.fix_text(text)
    text = html.unescape(html.unescape(text))
    return text.strip()
function ultralytics.models.sam.sam3.tokenizer_ve.whitespace_clean
def whitespace_clean(text)
Remove redundant whitespace.
Args
| Name | Type | Description | Default |
|---|---|---|---|
| text | | Input text string. | required |
Source code in ultralytics/models/sam/sam3/tokenizer_ve.py
def whitespace_clean(text):
    """Remove redundant whitespace."""
    text = re.sub(r"\s+", " ", text)
    text = text.strip()
    return text
function ultralytics.models.sam.sam3.tokenizer_ve._clean_canonicalize
def _clean_canonicalize(x)
Clean text and canonicalize it.
Args
| Name | Type | Description | Default |
|---|---|---|---|
| x | | Input text string. | required |
Source code in ultralytics/models/sam/sam3/tokenizer_ve.py
def _clean_canonicalize(x):
    """Clean text and canonicalize it."""
    # basic, remove whitespace, remove punctuation, lower case
    return canonicalize_text(basic_clean(x))
function ultralytics.models.sam.sam3.tokenizer_ve._clean_lower
def _clean_lower(x)
Clean text and return lowercase.
Args
| Name | Type | Description | Default |
|---|---|---|---|
| x | | Input text string. | required |
Source code in ultralytics/models/sam/sam3/tokenizer_ve.py
def _clean_lower(x):
    """Clean text and return lowercase."""
    # basic, remove whitespace, lower case
    return whitespace_clean(basic_clean(x)).lower()
function ultralytics.models.sam.sam3.tokenizer_ve._clean_whitespace
def _clean_whitespace(x)
Clean text and remove redundant whitespace.
Args
| Name | Type | Description | Default |
|---|---|---|---|
| x | | Input text string. | required |
Source code in ultralytics/models/sam/sam3/tokenizer_ve.py
def _clean_whitespace(x):
    """Clean text and remove redundant whitespace."""
    # basic, remove whitespace
    return whitespace_clean(basic_clean(x))
function ultralytics.models.sam.sam3.tokenizer_ve.get_clean_fn
def get_clean_fn(type: str)
Get text cleaning function by name.
Args
| Name | Type | Description | Default |
|---|---|---|---|
| type | str | Cleaning mode: "canonicalize", "lower", or "whitespace". | required |
Source code in ultralytics/models/sam/sam3/tokenizer_ve.py
def get_clean_fn(type: str):
    """Get text cleaning function by name."""
    if type == "canonicalize":
        return _clean_canonicalize
    elif type == "lower":
        return _clean_lower
    elif type == "whitespace":
        return _clean_whitespace
    else:
        assert False, f"Invalid clean function ({type})."
function ultralytics.models.sam.sam3.tokenizer_ve.canonicalize_text
def canonicalize_text(text, *, keep_punctuation_exact_string = None)
Returns canonicalized text (lowercase and punctuation removed), adapted from Google's big_vision prompt-engineering utilities.
Args
| Name | Type | Description | Default |
|---|---|---|---|
| text | | String to be canonicalized. | required |
| keep_punctuation_exact_string | | If provided, this exact string is kept. For example, providing '{}' will keep any occurrences of '{}' (but will still remove '{' and '}' that appear separately). | None |
Source code in ultralytics/models/sam/sam3/tokenizer_ve.py
def canonicalize_text(text, *, keep_punctuation_exact_string=None):
    """Returns canonicalized `text` (lowercase and punctuation removed). From:
    https://github.com/google-research/big_vision/blob/53f18caf27a9419231bbf08d3388b07671616d3d/big_vision/evaluators/proj/image_text/prompt_engineering.py#L94.

    Args:
        text: string to be canonicalized.
        keep_punctuation_exact_string: If provided, then this exact string kept. For example providing '{}' will keep
            any occurrences of '{}' (but will still remove '{' and '}' that appear separately).
    """
    text = text.replace("_", " ")
    if keep_punctuation_exact_string:
        text = keep_punctuation_exact_string.join(
            part.translate(str.maketrans("", "", string.punctuation))
            for part in text.split(keep_punctuation_exact_string)
        )
    else:
        text = text.translate(str.maketrans("", "", string.punctuation))
    text = text.lower()
    text = re.sub(r"\s+", " ", text)
    return text.strip()
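For instance, with and without the keep_punctuation_exact_string placeholder:

```python
from ultralytics.models.sam.sam3.tokenizer_ve import canonicalize_text

print(canonicalize_text("A photo of a {}, close-up!"))
# "a photo of a closeup"
print(canonicalize_text("A photo of a {}, close-up!", keep_punctuation_exact_string="{}"))
# "a photo of a {} closeup"
```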