Skip to content

Reference for ultralytics/nn/text_model.py

Note

This file is available at https://github.com/ultralytics/ultralytics/blob/main/ultralytics/nn/text_model.py. If you spot a problem, please help fix it by contributing a Pull Request 🛠️. Thank you 🙏!


ultralytics.nn.text_model.TextModel

TextModel()

Bases: Module

Abstract base class for text encoding models.

This class defines the interface for text encoding models used in vision-language tasks. Subclasses must implement the tokenize and encode_text methods.

Methods:

Name Description
tokenize

Convert input texts to tokens.

encode_text

Encode tokenized texts into feature vectors.

Source code in ultralytics/nn/text_model.py
def __init__(self):
    """Initialize the TextModel base class by running the parent class initializer."""
    super().__init__()

encode_text abstractmethod

encode_text(texts, dtype)

Encode tokenized texts into normalized feature vectors.

Source code in ultralytics/nn/text_model.py
@abstractmethod
def encode_text(self, texts, dtype):
    """
    Encode tokenized texts into normalized feature vectors.

    This is the interface declaration only: the base implementation does nothing and returns None.
    The signature takes `self` so that it matches the instance-method overrides in subclasses.

    Args:
        texts (torch.Tensor): Tokenized text inputs.
        dtype (torch.dtype): Data type for output features.

    Returns:
        (torch.Tensor): Normalized text feature vectors (in subclass implementations).
    """
    pass

tokenize abstractmethod

tokenize(texts)

Convert input texts to tokens for model processing.

Source code in ultralytics/nn/text_model.py
@abstractmethod
def tokenize(self, texts):
    """
    Convert input texts to tokens for model processing.

    This is the interface declaration only: the base implementation does nothing and returns None.
    The signature takes `self` so that it matches the instance-method overrides in subclasses.

    Args:
        texts: Input texts to tokenize.

    Returns:
        Tokenized representation of the texts (in subclass implementations).
    """
    pass





ultralytics.nn.text_model.CLIP

CLIP(size, device)

Bases: TextModel

OpenAI CLIP text encoder implementation.

This class implements the TextModel interface using OpenAI's CLIP model for text encoding.

Attributes:

Name Type Description
model CLIP

The loaded CLIP model.

device device

Device where the model is loaded.

Methods:

Name Description
tokenize

Convert input texts to CLIP tokens.

encode_text

Encode tokenized texts into normalized feature vectors.

Parameters:

Name Type Description Default
size str

Model size identifier (e.g., 'ViT-B/32').

required
device device

Device to load the model on.

required
Source code in ultralytics/nn/text_model.py
def __init__(self, size, device):
    """
    Load a CLIP model and prepare this module for inference on the given device.

    Args:
        size (str): Model size identifier (e.g., 'ViT-B/32').
        device (torch.device): Device to load the model on.
    """
    super().__init__()
    # Only the first item returned by clip.load is kept on this module.
    loaded = clip.load(size, device=device)
    self.model = loaded[0]
    self.to(device)
    self.device = device
    self.eval()

encode_text

encode_text(texts, dtype=torch.float32)

Encode tokenized texts into normalized feature vectors.

Parameters:

Name Type Description Default
texts Tensor

Tokenized text inputs.

required
dtype dtype

Data type for output features.

float32

Returns:

Type Description
Tensor

Normalized text feature vectors.

Source code in ultralytics/nn/text_model.py
@smart_inference_mode()
def encode_text(self, texts, dtype=torch.float32):
    """
    Run the CLIP text encoder and scale each feature vector to unit L2 norm.

    Args:
        texts (torch.Tensor): Tokenized text inputs.
        dtype (torch.dtype): Data type the encoder output is cast to before normalization.

    Returns:
        (torch.Tensor): Text features divided by their L2 norm along the last dimension.
    """
    features = self.model.encode_text(texts)
    features = features.to(dtype)
    # keepdim=True keeps the norm broadcastable against the feature tensor.
    l2_norm = features.norm(p=2, dim=-1, keepdim=True)
    return features / l2_norm

tokenize

tokenize(texts)

Convert input texts to CLIP tokens.

Source code in ultralytics/nn/text_model.py
def tokenize(self, texts):
    """
    Tokenize texts with `clip.tokenize` and move the result to this model's device.

    Args:
        texts: Input texts passed unchanged to `clip.tokenize`.

    Returns:
        The tokenizer output after `.to(self.device)`.
    """
    tokens = clip.tokenize(texts)
    return tokens.to(self.device)





ultralytics.nn.text_model.MobileCLIP

MobileCLIP(size, device)

Bases: TextModel

Apple MobileCLIP text encoder implementation.

This class implements the TextModel interface using Apple's MobileCLIP model for efficient text encoding.

Attributes:

Name Type Description
model MobileCLIP

The loaded MobileCLIP model.

tokenizer callable

Tokenizer function for processing text inputs.

device device

Device where the model is loaded.

config_size_map dict

Mapping from size identifiers to model configuration names.

Methods:

Name Description
tokenize

Convert input texts to MobileCLIP tokens.

encode_text

Encode tokenized texts into normalized feature vectors.

Parameters:

Name Type Description Default
size str

Model size identifier (e.g., 's0', 's1', 's2', 'b', 'blt').

required
device device

Device to load the model on.

required
Source code in ultralytics/nn/text_model.py
def __init__(self, size, device):
    """
    Set up the MobileCLIP text encoder, downloading the weights file when it is not found locally.

    Args:
        size (str): Model size identifier (e.g., 's0', 's1', 's2', 'b', 'blt').
        device (torch.device): Device to load the model on.
    """
    super().__init__()
    # Resolve the configuration name first so an unknown size fails before any download is attempted.
    model_name = f"mobileclip_{self.config_size_map[size]}"
    weights = f"mobileclip_{size}.pt"
    if not Path(weights).is_file():
        from ultralytics import download

        download(f"https://docs-assets.developer.apple.com/ml-research/datasets/mobileclip/{weights}")
    # Only the first item returned by create_model_and_transforms is kept on this module.
    created = mobileclip.create_model_and_transforms(model_name, pretrained=weights, device=device)
    self.model = created[0]
    self.tokenizer = mobileclip.get_tokenizer(model_name)
    self.to(device)
    self.device = device
    self.eval()

encode_text

encode_text(texts, dtype=torch.float32)

Encode tokenized texts into normalized feature vectors.

Parameters:

Name Type Description Default
texts Tensor

Tokenized text inputs.

required
dtype dtype

Data type for output features.

float32

Returns:

Type Description
Tensor

Normalized text feature vectors.

Source code in ultralytics/nn/text_model.py
@smart_inference_mode()
def encode_text(self, texts, dtype=torch.float32):
    """
    Run the MobileCLIP text encoder and scale each feature vector to unit L2 norm.

    Args:
        texts (torch.Tensor): Tokenized text inputs.
        dtype (torch.dtype): Data type the encoder output is cast to before normalization.

    Returns:
        (torch.Tensor): Text features divided by their L2 norm along the last dimension.
    """
    features = self.model.encode_text(texts).to(dtype)
    l2_norm = features.norm(p=2, dim=-1, keepdim=True)
    # In-place division, as in the original `/=`.
    features.div_(l2_norm)
    return features

tokenize

tokenize(texts)

Convert input texts to MobileCLIP tokens.

Source code in ultralytics/nn/text_model.py
def tokenize(self, texts):
    """
    Tokenize texts with the stored tokenizer and move the result to this model's device.

    Args:
        texts: Input texts passed unchanged to `self.tokenizer`.

    Returns:
        The tokenizer output after `.to(self.device)`.
    """
    tokens = self.tokenizer(texts)
    return tokens.to(self.device)





ultralytics.nn.text_model.build_text_model

build_text_model(variant, device=None)

Build a text encoding model based on the specified variant.

Parameters:

Name Type Description Default
variant str

Model variant in format "base:size" (e.g., "clip:ViT-B/32" or "mobileclip:s0").

required
device device

Device to load the model on.

None

Returns:

Type Description
TextModel

Instantiated text encoding model.

Raises:

Type Description
AssertionError

If the specified variant is not supported.

Source code in ultralytics/nn/text_model.py
def build_text_model(variant, device=None):
    """
    Build a text encoding model based on the specified variant.

    Args:
        variant (str): Model variant in format "base:size" (e.g., "clip:ViT-B/32" or "mobileclip:s0").
        device (torch.device, optional): Device to load the model on.

    Returns:
        (TextModel): Instantiated text encoding model.

    Raises:
        AssertionError: If the specified variant is not supported.
        ValueError: If `variant` does not split into exactly two parts on ":".
    """
    LOGGER.info(f"Build text model {variant}")
    base, size = variant.split(":")
    if base == "clip":
        return CLIP(size, device)
    if base == "mobileclip":
        return MobileCLIP(size, device)
    # Raise explicitly instead of `assert False`: assert statements are removed under `python -O`,
    # which would make this function fall through and return None for an unsupported variant.
    raise AssertionError(f"Variant not found: '{variant}'")



📅 Created 1 day ago ✏️ Updated 1 day ago