Reference for ultralytics/models/sam/sam3/tokenizer_ve.py
Improvements
This page is sourced from https://github.com/ultralytics/ultralytics/blob/main/ultralytics/models/sam/sam3/tokenizer_ve.py. Have an improvement or example to add? Open a Pull Request — thank you! 🙏
Summary
class ultralytics.models.sam.sam3.tokenizer_ve.SimpleTokenizer
def __init__(
    self,
    bpe_path: str | os.PathLike,
    additional_special_tokens: list[str] | None = None,
    context_length: int = 77,
    clean: str = "lower",
)
A simple tokenizer for text inputs.
Args
| Name | Type | Description | Default |
|---|---|---|---|
| bpe_path | str \| os.PathLike | Path to the gzip-compressed BPE merges/vocabulary file. | required |
| additional_special_tokens | list[str] \| None | Extra special tokens appended after <start_of_text> and <end_of_text>. | None |
| context_length | int | Default context (sequence) length used when tokenizing. | 77 |
| clean | str | Name of the text-cleaning function: "canonicalize", "lower", or "whitespace". | "lower" |
Methods
| Name | Description |
|---|---|
| __call__ | Return the tokenized representation of the given input string(s). |
| bpe | Byte Pair Encoding. |
| decode | Decodes a sequence of tokens back into a text string. |
| encode | Encode text to a sequence of BPE tokens. |
Source code in ultralytics/models/sam/sam3/tokenizer_ve.py
class SimpleTokenizer:
    """A simple tokenizer for text inputs."""

    def __init__(
        self,
        bpe_path: str | os.PathLike,
        additional_special_tokens: list[str] | None = None,
        context_length: int = 77,
        clean: str = "lower",
    ):
        """The tokenizer for text inputs."""
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        with g_pathmgr.open(bpe_path, "rb") as fh:
            bpe_bytes = io.BytesIO(fh.read())
            merges = gzip.open(bpe_bytes).read().decode("utf-8").split("\n")
        # merges = gzip.open(bpe_path).read().decode("utf-8").split("\n")
        merges = merges[1 : 49152 - 256 - 2 + 1]
        merges = [tuple(merge.split()) for merge in merges]
        vocab = list(bytes_to_unicode().values())
        vocab = vocab + [v + "</w>" for v in vocab]
        for merge in merges:
            vocab.append("".join(merge))
        special_tokens = ["<start_of_text>", "<end_of_text>"]
        if additional_special_tokens:
            special_tokens += additional_special_tokens
        vocab.extend(special_tokens)
        self.encoder = dict(zip(vocab, range(len(vocab))))
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {t: t for t in special_tokens}
        special = "|".join(special_tokens)
        self.pat = re.compile(
            special + r"""|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""",
            re.IGNORECASE,
        )
        self.vocab_size = len(self.encoder)
        self.all_special_ids = [self.encoder[t] for t in special_tokens]
        self.sot_token_id = self.all_special_ids[0]
        self.eot_token_id = self.all_special_ids[1]
        self.context_length = context_length
        self.clean_fn = get_clean_fn(clean)
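A minimal construction sketch is shown below; the bpe_simple_vocab_16e6.txt.gz path is an assumption and must point to a CLIP-style gzip-compressed BPE merges file on disk.

```python
from ultralytics.models.sam.sam3.tokenizer_ve import SimpleTokenizer

# Hypothetical path: replace with your local CLIP-style BPE merges file (gzip).
tokenizer = SimpleTokenizer(bpe_path="bpe_simple_vocab_16e6.txt.gz")

print(tokenizer.vocab_size)    # expected 49408 with the standard merges file and no extra special tokens
print(tokenizer.sot_token_id)  # id of "<start_of_text>"
print(tokenizer.eot_token_id)  # id of "<end_of_text>"
```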
method ultralytics.models.sam.sam3.tokenizer_ve.SimpleTokenizer.__call__
def __call__(self, texts: str | list[str], context_length: int | None = None) -> torch.LongTensor
Return the tokenized representation of the given input string(s). texts is an input string or a list of input strings to tokenize; context_length is the context length to use (all CLIP models use 77 as the context length).
Args
| Name | Type | Description | Default |
|---|---|---|---|
| texts | str \| list[str] | An input string or a list of input strings to tokenize. | required |
| context_length | int \| None | The context length to use; all CLIP models use 77. Defaults to the tokenizer's context_length. | None |
Returns
| Type | Description |
|---|---|
| torch.LongTensor | A two-dimensional tensor containing the resulting tokens, with shape [number of input strings, context_length]. |
Source code in ultralytics/models/sam/sam3/tokenizer_ve.py
def __call__(self, texts: str | list[str], context_length: int | None = None) -> torch.LongTensor:
    """Returns the tokenized representation of given input string(s).

    Parameters
    ----------
    texts : Union[str, list[str]]
        An input string or a list of input strings to tokenize
    context_length : int
        The context length to use; all CLIP models use 77 as the context length.

    Returns
    -------
    A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length]
    """
    if isinstance(texts, str):
        texts = [texts]
    context_length = context_length or self.context_length
    assert context_length, "Please set a valid context length"
    all_tokens = [[self.sot_token_id, *self.encode(text), self.eot_token_id] for text in texts]
    result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)
    for i, tokens in enumerate(all_tokens):
        if len(tokens) > context_length:
            tokens = tokens[:context_length]  # Truncate
            tokens[-1] = self.eot_token_id
        result[i, : len(tokens)] = torch.tensor(tokens)
    return result
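A short usage sketch, assuming a tokenizer built as in the constructor example above:

```python
# Tokenize a batch of prompts; output is a LongTensor padded/truncated to context_length.
tokens = tokenizer(["a photo of a cat", "a photo of a dog"])

print(tokens.shape)  # torch.Size([2, 77]) with the default context_length of 77
print(tokens.dtype)  # torch.int64
print(tokens[0, 0].item() == tokenizer.sot_token_id)  # True: every row starts with <start_of_text>
```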
method ultralytics.models.sam.sam3.tokenizer_ve.SimpleTokenizer.bpe
def bpe(self, token)
Byte Pair Encoding.
Args
| Name | Type | Description | Default |
|---|---|---|---|
| token | str | Word to encode, given as a string of byte-encoded characters. | required |
Source code in ultralytics/models/sam/sam3/tokenizer_ve.py
def bpe(self, token):
    """Byte Pair Encoding."""
    if token in self.cache:
        return self.cache[token]
    word = (*tuple(token[:-1]), token[-1] + "</w>")
    pairs = get_pairs(word)
    if not pairs:
        return token + "</w>"
    while True:
        bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
        if bigram not in self.bpe_ranks:
            break
        first, second = bigram
        new_word = []
        i = 0
        while i < len(word):
            try:
                j = word.index(first, i)
                new_word.extend(word[i:j])
                i = j
            except Exception:
                new_word.extend(word[i:])
                break
            if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                new_word.append(first + second)
                i += 2
            else:
                new_word.append(word[i])
                i += 1
        new_word = tuple(new_word)
        word = new_word
        if len(word) == 1:
            break
        else:
            pairs = get_pairs(word)
    word = " ".join(word)
    self.cache[token] = word
    return word
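A hedged sketch of calling bpe directly (using the tokenizer from the constructor example above); the exact sub-word split depends on the merges file that was loaded, so only the general shape of the output is indicated.

```python
# bpe() expects a byte-encoded token (plain ASCII maps to itself) and returns
# space-separated BPE sub-words, with "</w>" marking the end of the word.
print(tokenizer.bpe("cat"))        # e.g. "cat</w>" for short common words
print(tokenizer.bpe("tokenizer"))  # e.g. "token izer</w>" (actual split depends on the merges)
```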
method ultralytics.models.sam.sam3.tokenizer_ve.SimpleTokenizer.decode
def decode(self, tokens)
Decodes a sequence of tokens back into a text string.
Args
| Name | Type | Description | Default |
|---|---|---|---|
| tokens | | Sequence of integer token ids to decode. | required |
Source code in ultralytics/models/sam/sam3/tokenizer_ve.py
def decode(self, tokens):
    """Decodes a sequence of tokens back into a text string."""
    text = "".join([self.decoder[token] for token in tokens])
    text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors="replace").replace("</w>", " ")
    return text
method ultralytics.models.sam.sam3.tokenizer_ve.SimpleTokenizer.encode
def encode(self, text)
Encode text to a sequence of BPE tokens.
Args
| Name | Type | Description | Default |
|---|---|---|---|
| text | | Input text string to encode. | required |
Source code in ultralytics/models/sam/sam3/tokenizer_ve.py
def encode(self, text):
    """Encode text to a sequence of BPE tokens."""
    bpe_tokens = []
    text = self.clean_fn(text)
    for token in re.findall(self.pat, text):
        token = "".join(self.byte_encoder[b] for b in token.encode("utf-8"))
        bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(" "))
    return bpe_tokens
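A round-trip sketch with encode and decode, again assuming the tokenizer from the constructor example:

```python
ids = tokenizer.encode("a photo of a cat")  # list of integer BPE ids; no special tokens are added by encode
text = tokenizer.decode(ids)                # "a photo of a cat " (decode maps "</w>" back to spaces)

print(ids)
print(text)
```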
function ultralytics.models.sam.sam3.tokenizer_ve.bytes_to_unicode
def bytes_to_unicode()
Returns a mapping between utf-8 bytes and corresponding unicode strings. The reversible BPE codes work on unicode strings, which means the vocabulary needs a large number of unicode characters to avoid UNKs. At around a 10B-token dataset you end up needing roughly 5K characters for decent coverage, a significant fraction of a typical 32K BPE vocabulary. To avoid that, this function provides lookup tables between utf-8 bytes and unicode strings, while avoiding whitespace/control characters that the BPE code cannot handle.
Source code in ultralytics/models/sam/sam3/tokenizer_ve.py
@lru_cache
def bytes_to_unicode():
    """Returns list of utf-8 byte and a corresponding list of unicode strings. The reversible bpe codes work on
    unicode strings. This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. This is a
    significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup tables between utf-8
    bytes and unicode strings. And avoids mapping to whitespace/control characters the bpe code barfs on.
    """
    bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))
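A quick property check of the byte-to-unicode table: it covers all 256 byte values and is fully reversible.

```python
from ultralytics.models.sam.sam3.tokenizer_ve import bytes_to_unicode

b2u = bytes_to_unicode()
u2b = {v: k for k, v in b2u.items()}

assert len(b2u) == 256 and len(u2b) == 256         # bijective over all byte values
assert all(u2b[b2u[b]] == b for b in range(256))   # reversible
assert all(not c.isspace() for c in b2u.values())  # no whitespace/control characters in the outputs
```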
function ultralytics.models.sam.sam3.tokenizer_ve.get_pairs
def get_pairs(word)
Return set of symbol pairs in a word. Word is represented as tuple of symbols (symbols being variable-length
strings).
Args
| Name | Type | Description | Default |
|---|---|---|---|
| word | | Word represented as a tuple of symbols (variable-length strings). | required |
Source code in ultralytics/models/sam/sam3/tokenizer_ve.py
def get_pairs(word):
    """Return set of symbol pairs in a word. Word is represented as tuple of symbols (symbols being variable-length
    strings).
    """
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs
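For example, for the word tuple ("h", "e", "l", "l", "o</w>"):

```python
from ultralytics.models.sam.sam3.tokenizer_ve import get_pairs

print(get_pairs(("h", "e", "l", "l", "o</w>")))
# {('h', 'e'), ('e', 'l'), ('l', 'l'), ('l', 'o</w>')}  (set order may vary)
```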
function ultralytics.models.sam.sam3.tokenizer_ve.basic_clean
def basic_clean(text)
Basic text cleaning: fix unicode and unescape HTML entities.
Args
| Name | Type | Description | Default |
|---|---|---|---|
| text | | Input text string. | required |
Source code in ultralytics/models/sam/sam3/tokenizer_ve.py
def basic_clean(text):
    """Basic text cleaning: fix unicode and unescape HTML entities."""
    text = ftfy.fix_text(text)
    text = html.unescape(html.unescape(text))
    return text.strip()
function ultralytics.models.sam.sam3.tokenizer_ve.whitespace_clean
def whitespace_clean(text)
Remove redundant whitespace.
Args
| Name | Type | Description | Default |
|---|---|---|---|
| text | | Input text string. | required |
Source code in ultralytics/models/sam/sam3/tokenizer_ve.py
def whitespace_clean(text):
    """Remove redundant whitespace."""
    text = re.sub(r"\s+", " ", text)
    text = text.strip()
    return text
function ultralytics.models.sam.sam3.tokenizer_ve._clean_canonicalize
def _clean_canonicalize(x)
Clean text and canonicalize it.
Args
| Name | Type | Description | Default |
|---|---|---|---|
| x | | Input text string. | required |
Source code in ultralytics/models/sam/sam3/tokenizer_ve.py
def _clean_canonicalize(x):
    """Clean text and canonicalize it."""
    # basic, remove whitespace, remove punctuation, lower case
    return canonicalize_text(basic_clean(x))
function ultralytics.models.sam.sam3.tokenizer_ve._clean_lower
def _clean_lower(x)
Clean text and return lowercase.
Args
| Name | Type | Description | Default |
|---|---|---|---|
| x | | Input text string. | required |
Source code in ultralytics/models/sam/sam3/tokenizer_ve.py
def _clean_lower(x):
    """Clean text and return lowercase."""
    # basic, remove whitespace, lower case
    return whitespace_clean(basic_clean(x)).lower()
function ultralytics.models.sam.sam3.tokenizer_ve._clean_whitespace
def _clean_whitespace(x)
Clean text and remove redundant whitespace.
Args
| Name | Type | Description | Default |
|---|---|---|---|
| x | | Input text string. | required |
Source code in ultralytics/models/sam/sam3/tokenizer_ve.py
def _clean_whitespace(x):
    """Clean text and remove redundant whitespace."""
    # basic, remove whitespace
    return whitespace_clean(basic_clean(x))
function ultralytics.models.sam.sam3.tokenizer_ve.get_clean_fn
def get_clean_fn(type: str)
Get text cleaning function by name.
Args
| Name | Type | Description | Default |
|---|---|---|---|
| type | str | Cleaning mode: "canonicalize", "lower", or "whitespace". | required |
Source code in ultralytics/models/sam/sam3/tokenizer_ve.py
def get_clean_fn(type: str):
    """Get text cleaning function by name."""
    if type == "canonicalize":
        return _clean_canonicalize
    elif type == "lower":
        return _clean_lower
    elif type == "whitespace":
        return _clean_whitespace
    else:
        assert False, f"Invalid clean function ({type})."
function ultralytics.models.sam.sam3.tokenizer_ve.canonicalize_text
def canonicalize_text(text, *, keep_punctuation_exact_string = None)
Returns canonicalized text (lowercase and punctuation removed), adapted from Google's big_vision prompt-engineering utilities.
Args
| Name | Type | Description | Default |
|---|---|---|---|
| text | | String to be canonicalized. | required |
| keep_punctuation_exact_string | | If provided, this exact string is kept. For example, providing '{}' will keep any occurrences of '{}' (but will still remove '{' and '}' that appear separately). | None |
Source code in ultralytics/models/sam/sam3/tokenizer_ve.py
def canonicalize_text(text, *, keep_punctuation_exact_string=None):
    """Returns canonicalized `text` (lowercase and punctuation removed). From:
    https://github.com/google-research/big_vision/blob/53f18caf27a9419231bbf08d3388b07671616d3d/big_vision/evaluators/proj/image_text/prompt_engineering.py#L94.

    Args:
        text: string to be canonicalized.
        keep_punctuation_exact_string: If provided, then this exact string kept. For example providing '{}' will keep
            any occurrences of '{}' (but will still remove '{' and '}' that appear separately).
    """
    text = text.replace("_", " ")
    if keep_punctuation_exact_string:
        text = keep_punctuation_exact_string.join(
            part.translate(str.maketrans("", "", string.punctuation))
            for part in text.split(keep_punctuation_exact_string)
        )
    else:
        text = text.translate(str.maketrans("", "", string.punctuation))
    text = text.lower()
    text = re.sub(r"\s+", " ", text)
    return text.strip()
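For instance, with and without the keep_punctuation_exact_string placeholder:

```python
from ultralytics.models.sam.sam3.tokenizer_ve import canonicalize_text

print(canonicalize_text("A photo of a {}, close-up!"))
# "a photo of a closeup"
print(canonicalize_text("A photo of a {}, close-up!", keep_punctuation_exact_string="{}"))
# "a photo of a {} closeup"
```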