Reference for ultralytics/models/sam/sam3/text_encoder_ve.py
Improvements
This page is sourced from https://github.com/ultralytics/ultralytics/blob/main/ultralytics/models/sam/sam3/text_encoder_ve.py. Have an improvement or example to add? Open a Pull Request — thank you! 🙏
Summary
class ultralytics.models.sam.sam3.text_encoder_ve.ResidualAttentionBlock
def __init__(
self,
d_model: int,
n_head: int,
mlp_ratio: float = 4.0,
ls_init_value: float | None = None,
act_layer: Callable[[], nn.Module] = nn.GELU,
norm_layer: Callable[[int], nn.Module] = nn.LayerNorm,
)
Bases: nn.Module
Transformer block with multi-head attention, layer normalization, and MLP feed-forward network.
Args
| Name | Type | Description | Default |
|---|---|---|---|
| d_model | int | Embedding dimension of the block. | required |
| n_head | int | Number of attention heads. | required |
| mlp_ratio | float | Ratio of the MLP hidden width to d_model. | 4.0 |
| ls_init_value | float \| None | Initial LayerScale value; None disables LayerScale. | None |
| act_layer | Callable[[], nn.Module] | Activation layer constructor for the MLP. | nn.GELU |
| norm_layer | Callable[[int], nn.Module] | Normalization layer constructor. | nn.LayerNorm |
Methods
| Name | Description |
|---|---|
attention | Compute multi-head attention with optional cross-attention support and masking. |
forward | Apply residual attention with layer normalization and MLP, supporting optional cross-attention. |
Source code in ultralytics/models/sam/sam3/text_encoder_ve.py
class ResidualAttentionBlock(nn.Module):
"""Transformer block with multi-head attention, layer normalization, and MLP feed-forward network."""
def __init__(
self,
d_model: int,
n_head: int,
mlp_ratio: float = 4.0,
ls_init_value: float | None = None,
act_layer: Callable[[], nn.Module] = nn.GELU,
norm_layer: Callable[[int], nn.Module] = nn.LayerNorm,
):
"""Initialize residual attention block with configurable dimensions and normalization."""
super().__init__()
# Attention
self.attn = nn.MultiheadAttention(d_model, n_head, batch_first=True)
# LayerNorm, LayerScale
self.ln_1 = norm_layer(d_model)
self.ln_2 = norm_layer(d_model)
self.ls_1 = LayerScale(d_model, ls_init_value) if ls_init_value is not None else nn.Identity()
self.ls_2 = LayerScale(d_model, ls_init_value) if ls_init_value is not None else nn.Identity()
# MLP
mlp_width = int(d_model * mlp_ratio)
self.mlp = nn.Sequential(
OrderedDict(
[
("c_fc", nn.Linear(d_model, mlp_width)),
("gelu", act_layer()),
("c_proj", nn.Linear(mlp_width, d_model)),
]
)
)
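The snippet below is a minimal usage sketch for this block, assuming the class can be imported from the module path documented above; weights are randomly initialized, so only the shapes are meaningful.

```python
import torch

from ultralytics.models.sam.sam3.text_encoder_ve import ResidualAttentionBlock

block = ResidualAttentionBlock(d_model=512, n_head=8, mlp_ratio=4.0, ls_init_value=1e-5)
x = torch.randn(2, 77, 512)  # (batch, seq, d_model); the attention module is built with batch_first=True
y = block(x)                 # self-attention: k_x and v_x default to the normalized queries
print(y.shape)               # torch.Size([2, 77, 512])
```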
method ultralytics.models.sam.sam3.text_encoder_ve.ResidualAttentionBlock.attention
def attention(
self, q_x: torch.Tensor, k_x: torch.Tensor = None, v_x: torch.Tensor = None, attn_mask: torch.Tensor = None
) -> torch.Tensor
Compute multi-head attention with optional cross-attention support and masking.
Args
| Name | Type | Description | Default |
|---|---|---|---|
| q_x | torch.Tensor | Query tensor of shape (batch, seq, d_model). | required |
| k_x | torch.Tensor | Key tensor; falls back to q_x when None. | None |
| v_x | torch.Tensor | Value tensor; falls back to q_x when None. | None |
| attn_mask | torch.Tensor | Optional boolean or additive attention mask. | None |
Source code in ultralytics/models/sam/sam3/text_encoder_ve.py
def attention(
self, q_x: torch.Tensor, k_x: torch.Tensor = None, v_x: torch.Tensor = None, attn_mask: torch.Tensor = None
) -> torch.Tensor:
"""Compute multi-head attention with optional cross-attention support and masking."""
k_x = k_x if k_x is not None else q_x
v_x = v_x if v_x is not None else q_x
if attn_mask is not None:
# Leave boolean masks as is
if not attn_mask.dtype == torch.bool:
attn_mask = attn_mask.to(q_x.dtype)
return self.attn(q_x, k_x, v_x, need_weights=False, attn_mask=attn_mask)[0]
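A sketch of the two mask conventions this method accepts, assuming the same import path: boolean masks (True blocks attention) are passed through unchanged, while floating-point masks are treated as additive and cast to the query dtype. Passing k_x/v_x directly here performs cross-attention on the raw inputs.

```python
import torch

from ultralytics.models.sam.sam3.text_encoder_ve import ResidualAttentionBlock

block = ResidualAttentionBlock(d_model=128, n_head=4)
x = torch.randn(2, 16, 128)

bool_mask = torch.triu(torch.ones(16, 16, dtype=torch.bool), diagonal=1)  # True = position is blocked
add_mask = torch.zeros(16, 16).masked_fill(bool_mask, float("-inf"))      # additive -inf mask

out_bool = block.attention(q_x=x, attn_mask=bool_mask)  # boolean mask used as-is
out_add = block.attention(q_x=x, attn_mask=add_mask)    # float mask cast to x.dtype
print(out_bool.shape, out_add.shape)  # torch.Size([2, 16, 128]) torch.Size([2, 16, 128])

# Cross-attention: keys/values come from a separate sequence of the same width.
context = torch.randn(2, 32, 128)
out_cross = block.attention(q_x=x, k_x=context, v_x=context)
print(out_cross.shape)  # torch.Size([2, 16, 128]), output keeps the query length
```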
method ultralytics.models.sam.sam3.text_encoder_ve.ResidualAttentionBlock.forward
def forward(
self, q_x: torch.Tensor, k_x: torch.Tensor = None, v_x: torch.Tensor = None, attn_mask: torch.Tensor = None
) -> torch.Tensor
Apply residual attention with layer normalization and MLP, supporting optional cross-attention.
Args
| Name | Type | Description | Default |
|---|---|---|---|
| q_x | torch.Tensor | Query tensor of shape (batch, seq, d_model). | required |
| k_x | torch.Tensor | Optional key tensor for cross-attention. | None |
| v_x | torch.Tensor | Optional value tensor for cross-attention. | None |
| attn_mask | torch.Tensor | Optional attention mask. | None |
Source code in ultralytics/models/sam/sam3/text_encoder_ve.py
def forward(
self, q_x: torch.Tensor, k_x: torch.Tensor = None, v_x: torch.Tensor = None, attn_mask: torch.Tensor = None
) -> torch.Tensor:
"""Apply residual attention with layer normalization and MLP, supporting optional cross-attention."""
k_x = self.ln_1_kv(k_x) if hasattr(self, "ln_1_kv") and k_x is not None else None
v_x = self.ln_1_kv(v_x) if hasattr(self, "ln_1_kv") and v_x is not None else None
x = q_x + self.ls_1(self.attention(q_x=self.ln_1(q_x), k_x=k_x, v_x=v_x, attn_mask=attn_mask))
x = x + self.ls_2(self.mlp(self.ln_2(x)))
return x
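Note that forward() only hands k_x/v_x to attention() when the block exposes an ln_1_kv key/value norm (checked via hasattr), which this __init__ does not create; without one, those arguments are dropped and the block self-attends over q_x. The sketch below, under the same import assumption, illustrates this and attaches a hypothetical ln_1_kv purely to show the cross-attention path.

```python
import torch
import torch.nn as nn

from ultralytics.models.sam.sam3.text_encoder_ve import ResidualAttentionBlock

block = ResidualAttentionBlock(d_model=128, n_head=4)
q = torch.randn(1, 6, 128)
ctx = torch.randn(1, 12, 128)

# Without an ln_1_kv attribute, k_x/v_x are ignored and forward() reduces to self-attention.
print(torch.allclose(block(q), block(q, k_x=ctx, v_x=ctx)))  # True

# Attaching a key/value norm (illustrative only; not created by __init__) enables cross-attention.
block.ln_1_kv = nn.LayerNorm(128)
out = block(q, k_x=ctx, v_x=ctx)
print(out.shape)  # torch.Size([1, 6, 128])
```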
class ultralytics.models.sam.sam3.text_encoder_ve.Transformer
def __init__(
self,
width: int,
layers: int,
heads: int,
mlp_ratio: float = 4.0,
ls_init_value: float | None = None,
act_layer: Callable[[], nn.Module] = nn.GELU,
norm_layer: Callable[[int], nn.Module] = nn.LayerNorm,
compile_mode: str | None = None,
use_act_checkpoint: bool = False,
)
Bases: nn.Module
Stack of residual attention blocks forming a transformer encoder with optional gradient checkpointing.
Args
| Name | Type | Description | Default |
|---|---|---|---|
| width | int | Embedding width of each block. | required |
| layers | int | Number of residual attention blocks. | required |
| heads | int | Number of attention heads per block. | required |
| mlp_ratio | float | Ratio of the MLP hidden width to width. | 4.0 |
| ls_init_value | float \| None | Initial LayerScale value; None disables LayerScale. | None |
| act_layer | Callable[[], nn.Module] | Activation layer constructor. | nn.GELU |
| norm_layer | Callable[[int], nn.Module] | Normalization layer constructor. | nn.LayerNorm |
| compile_mode | str \| None | torch.compile mode; None disables compilation. | None |
| use_act_checkpoint | bool | Enable gradient checkpointing during training. | False |
Methods
| Name | Description |
|---|---|
forward | Process input through all transformer blocks with optional gradient checkpointing during training. |
Source code in ultralytics/models/sam/sam3/text_encoder_ve.py
class Transformer(nn.Module):
"""Stack of residual attention blocks forming a transformer encoder with optional gradient checkpointing."""
def __init__(
self,
width: int,
layers: int,
heads: int,
mlp_ratio: float = 4.0,
ls_init_value: float | None = None,
act_layer: Callable[[], nn.Module] = nn.GELU,
norm_layer: Callable[[int], nn.Module] = nn.LayerNorm,
compile_mode: str | None = None,
use_act_checkpoint: bool = False,
):
"""Initialize transformer with configurable depth, width, and optional compilation/checkpointing."""
super().__init__()
self.width = width
self.layers = layers
self.grad_checkpointing = use_act_checkpoint
self.resblocks = nn.ModuleList(
[
ResidualAttentionBlock(
width,
heads,
mlp_ratio,
ls_init_value=ls_init_value,
act_layer=act_layer,
norm_layer=norm_layer,
)
for _ in range(layers)
]
)
if compile_mode is not None:
self.forward = torch.compile(self.forward, mode=compile_mode, fullgraph=True)
if self.grad_checkpointing:
torch._dynamo.config.optimize_ddp = False
method ultralytics.models.sam.sam3.text_encoder_ve.Transformer.forward
def forward(self, x: torch.Tensor, attn_mask: torch.Tensor = None) -> torch.Tensor
Process input through all transformer blocks with optional gradient checkpointing during training.
Args
| Name | Type | Description | Default |
|---|---|---|---|
| x | torch.Tensor | Input tensor of shape (batch, seq, width). | required |
| attn_mask | torch.Tensor | Optional attention mask passed to every block. | None |
Source code in ultralytics/models/sam/sam3/text_encoder_ve.py
def forward(self, x: torch.Tensor, attn_mask: torch.Tensor = None) -> torch.Tensor:
"""Process input through all transformer blocks with optional gradient checkpointing during training."""
for _, r in enumerate(self.resblocks):
if self.grad_checkpointing and not torch.jit.is_scripting() and self.training:
x = checkpoint(r, x, None, None, attn_mask, use_reentrant=False)
else:
x = r(x, attn_mask=attn_mask)
return x
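A usage sketch for the block stack, assuming the documented import path; with use_act_checkpoint=True, per-block activation checkpointing is only applied while the module is in training mode.

```python
import torch

from ultralytics.models.sam.sam3.text_encoder_ve import Transformer

model = Transformer(width=256, layers=4, heads=8, use_act_checkpoint=True)
x = torch.randn(2, 32, 256)                            # (batch, seq, width)
causal = torch.full((32, 32), float("-inf")).triu_(1)  # additive causal mask

model.train()
y = model(x, attn_mask=causal)      # each block runs under torch.utils.checkpoint
model.eval()
with torch.no_grad():
    y = model(x, attn_mask=causal)  # plain forward at inference time
print(y.shape)                      # torch.Size([2, 32, 256])
```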
class ultralytics.models.sam.sam3.text_encoder_ve.TextTransformer
def __init__(
self,
context_length: int = 77,
vocab_size: int = 49408,
width: int = 512,
heads: int = 8,
layers: int = 12,
mlp_ratio: float = 4.0,
ls_init_value: float | None = None,
output_dim: int = 512,
no_causal_mask: bool = False,
pool_type: str = "none", # no pooling
proj_bias: bool = False,
act_layer: Callable = nn.GELU,
norm_layer: Callable = nn.LayerNorm,
output_tokens: bool = False,
use_ln_post: bool = True,
compile_mode: str | None = None,
use_act_checkpoint: bool = False,
)
Bases: nn.Module
Text transformer encoder with causal masking and flexible pooling strategies.
Args
| Name | Type | Description | Default |
|---|---|---|---|
| context_length | int | Maximum number of tokens per sequence. | 77 |
| vocab_size | int | Size of the token vocabulary. | 49408 |
| width | int | Transformer embedding width. | 512 |
| heads | int | Number of attention heads. | 8 |
| layers | int | Number of residual attention blocks. | 12 |
| mlp_ratio | float | Ratio of the MLP hidden width to width. | 4.0 |
| ls_init_value | float \| None | Initial LayerScale value; None disables LayerScale. | None |
| output_dim | int | Dimension of the projected text features. | 512 |
| no_causal_mask | bool | Disable the causal attention mask. | False |
| pool_type | str | Pooling strategy: "first", "last", "argmax", or "none". | "none" |
| proj_bias | bool | Use an nn.Linear projection (with bias) instead of a plain weight matrix. | False |
| act_layer | Callable | Activation layer constructor. | nn.GELU |
| norm_layer | Callable | Normalization layer constructor. | nn.LayerNorm |
| output_tokens | bool | Also return per-token embeddings from forward. | False |
| use_ln_post | bool | Apply a final LayerNorm after the transformer. | True |
| compile_mode | str \| None | torch.compile mode; None disables compilation. | None |
| use_act_checkpoint | bool | Enable gradient checkpointing during training. | False |
Methods
| Name | Description |
|---|---|
build_causal_mask | Create a causal attention mask to prevent attention to future tokens. |
forward | Forward pass through the text transformer, returning pooled output and optionally token embeddings. |
Source code in ultralytics/models/sam/sam3/text_encoder_ve.py
class TextTransformer(nn.Module):
"""Text transformer encoder with causal masking and flexible pooling strategies."""
def __init__(
self,
context_length: int = 77,
vocab_size: int = 49408,
width: int = 512,
heads: int = 8,
layers: int = 12,
mlp_ratio: float = 4.0,
ls_init_value: float | None = None,
output_dim: int = 512,
no_causal_mask: bool = False,
pool_type: str = "none", # no pooling
proj_bias: bool = False,
act_layer: Callable = nn.GELU,
norm_layer: Callable = nn.LayerNorm,
output_tokens: bool = False,
use_ln_post: bool = True,
compile_mode: str | None = None,
use_act_checkpoint: bool = False,
):
"""Initialize text transformer with embedding layers, transformer blocks, and pooling options."""
super().__init__()
assert pool_type in ("first", "last", "argmax", "none")
self.output_tokens = output_tokens
self.num_pos = self.context_length = context_length
self.vocab_size = vocab_size
self.width = width
self.output_dim = output_dim
self.heads = heads
self.pool_type = pool_type
self.token_embedding = nn.Embedding(self.vocab_size, width)
self.positional_embedding = nn.Parameter(torch.empty(self.num_pos, width))
self.transformer = Transformer(
width=width,
layers=layers,
heads=heads,
mlp_ratio=mlp_ratio,
ls_init_value=ls_init_value,
act_layer=act_layer,
norm_layer=norm_layer,
compile_mode=compile_mode,
use_act_checkpoint=use_act_checkpoint,
)
self.ln_final = norm_layer(width) if use_ln_post else nn.Identity()
if no_causal_mask:
self.attn_mask = None
else:
self.register_buffer("attn_mask", self.build_causal_mask(), persistent=False)
if proj_bias:
self.text_projection = nn.Linear(width, output_dim)
else:
self.text_projection = nn.Parameter(torch.empty(width, output_dim))
method ultralytics.models.sam.sam3.text_encoder_ve.TextTransformer.build_causal_mask
def build_causal_mask(self) -> torch.Tensor
Create a causal attention mask to prevent attention to future tokens.
Source code in ultralytics/models/sam/sam3/text_encoder_ve.py
def build_causal_mask(self) -> torch.Tensor:
"""Create a causal attention mask to prevent attention to future tokens."""
# lazily create causal attention mask, with full attention between the tokens
# pytorch uses additive attention mask; fill with -inf
mask = torch.empty(self.num_pos, self.num_pos)
mask.fill_(float("-inf"))
mask.triu_(1) # zero out the lower diagonal
return mask
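For reference, this is the additive mask the method produces, reproduced standalone for num_pos=4: zeros on and below the diagonal (positions a token may attend to) and -inf above it (future positions removed from the softmax).

```python
import torch

mask = torch.empty(4, 4).fill_(float("-inf")).triu_(1)
print(mask)
# tensor([[0., -inf, -inf, -inf],
#         [0., 0., -inf, -inf],
#         [0., 0., 0., -inf],
#         [0., 0., 0., 0.]])
```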
method ultralytics.models.sam.sam3.text_encoder_ve.TextTransformer.forward
def forward(self, text: torch.Tensor) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]
Forward pass through the text transformer, returning pooled output and optionally token embeddings.
Args
| Name | Type | Description | Default |
|---|---|---|---|
| text | torch.Tensor | Token ID tensor of shape (batch, seq_len). | required |
Source code in ultralytics/models/sam/sam3/text_encoder_ve.py
def forward(self, text: torch.Tensor) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
"""Forward pass through the text transformer, returning pooled output and optionally token embeddings."""
seq_len = text.shape[1]
x = self.token_embedding(text) # [batch_size, n_ctx, d_model]
attn_mask = self.attn_mask
if attn_mask is not None:
attn_mask = attn_mask[:seq_len, :seq_len]
x = x + self.positional_embedding[:seq_len]
x = self.transformer(x, attn_mask=attn_mask)
x = self.ln_final(x)
pooled, tokens = text_global_pool(x, text, pool_type=self.pool_type)
if self.text_projection is not None:
if isinstance(self.text_projection, nn.Linear):
pooled = self.text_projection(pooled)
else:
pooled = pooled @ self.text_projection
if self.output_tokens:
return pooled, tokens
return pooled
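A shapes-only sketch of the forward pass, assuming the documented import path; weights here are uninitialized, so values are meaningless but the returned shapes match what a loaded checkpoint would produce.

```python
import torch

from ultralytics.models.sam.sam3.text_encoder_ve import TextTransformer

model = TextTransformer(width=512, heads=8, layers=2, pool_type="argmax", output_tokens=True)
text = torch.randint(1, 49408, (2, 77))  # (batch, n_ctx) token IDs
pooled, tokens = model(text)
print(pooled.shape, tokens.shape)        # torch.Size([2, 512]) torch.Size([2, 77, 512])
```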
class ultralytics.models.sam.sam3.text_encoder_ve.VETextEncoder
def __init__(
self,
d_model: int,
tokenizer: Callable,
width: int = 1024,
heads: int = 16,
layers: int = 24,
context_length: int = 32,
vocab_size: int = 49408,
use_ln_post: bool = True,
compile_mode: str | None = None,
use_act_checkpoint: bool = True,
)
Bases: nn.Module
Text encoder for Vision Encoder (VE) models, combining a text transformer and a linear resizer.
Args
| Name | Type | Description | Default |
|---|---|---|---|
| d_model | int | Output dimension of the resizer, matching the decoder. | required |
| tokenizer | Callable | Tokenizer called as tokenizer(text, context_length=...). | required |
| width | int | Width of the underlying text transformer. | 1024 |
| heads | int | Number of attention heads. | 16 |
| layers | int | Number of residual attention blocks. | 24 |
| context_length | int | Maximum number of tokens per sequence. | 32 |
| vocab_size | int | Size of the token vocabulary. | 49408 |
| use_ln_post | bool | Apply a final LayerNorm after the transformer. | True |
| compile_mode | str \| None | torch.compile mode; None disables compilation. | None |
| use_act_checkpoint | bool | Enable gradient checkpointing during training. | True |
Methods
| Name | Description |
|---|---|
forward | Encode text input, either raw strings or pre-encoded tensors, and resize to match decoder dimensions. |
Source code in ultralytics/models/sam/sam3/text_encoder_ve.py
class VETextEncoder(nn.Module):
"""Text encoder for Vision Encoder (VE) models, combining a text transformer and a linear resizer."""
def __init__(
self,
d_model: int,
tokenizer: Callable,
width: int = 1024,
heads: int = 16,
layers: int = 24,
context_length: int = 32,
vocab_size: int = 49408,
use_ln_post: bool = True,
compile_mode: str | None = None,
use_act_checkpoint: bool = True,
):
"""Initialize VE text encoder with a text transformer and a linear resizer to match decoder dimensions."""
super().__init__()
self.context_length = context_length
self.use_ln_post = use_ln_post
self.tokenizer = tokenizer
self.encoder = TextTransformer(
context_length=self.context_length,
vocab_size=vocab_size,
width=width,
heads=heads,
layers=layers,
# we want the tokens, not just the pooled output
output_tokens=True,
use_ln_post=use_ln_post,
compile_mode=compile_mode,
use_act_checkpoint=use_act_checkpoint,
)
self.resizer = nn.Linear(self.encoder.width, d_model)
method ultralytics.models.sam.sam3.text_encoder_ve.VETextEncoder.forward
def forward(
self, text: list[str] | tuple[torch.Tensor, torch.Tensor, dict], input_boxes: list | None = None
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]
Encode text input, either raw strings or pre-encoded tensors, and resize to match decoder dimensions.
Args
| Name | Type | Description | Default |
|---|---|---|---|
| text | list[str] \| tuple[torch.Tensor, torch.Tensor, dict] | Raw strings to encode, or already-encoded (attention_mask, memory_resized, tokenized) inputs. | required |
| input_boxes | list \| None | Not supported; must be None or empty. | None |
Source code in ultralytics/models/sam/sam3/text_encoder_ve.py
def forward(
self, text: list[str] | tuple[torch.Tensor, torch.Tensor, dict], input_boxes: list | None = None
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""Encode text input, either raw strings or pre-encoded tensors, and resize to match decoder dimensions."""
if isinstance(text[0], str):
# no use case for this
assert input_boxes is None or len(input_boxes) == 0, "not supported"
# Encode the text
tokenized = self.tokenizer(text, context_length=self.context_length).to(
self.resizer.weight.device
) # [b, seq_len]
text_attention_mask = (tokenized != 0).bool()
# manually embed the tokens
inputs_embeds = self.encoder.token_embedding(tokenized) # [b, seq_len, d=1024]
_, text_memory = self.encoder(tokenized) # [b, seq_len, d=1024]
assert text_memory.shape[1] == inputs_embeds.shape[1]
# Invert attention mask because it's the opposite in pytorch transformer
text_attention_mask = text_attention_mask.ne(1)
# Transpose memory because pytorch's attention expects sequence first
text_memory = text_memory.transpose(0, 1)
# Resize the encoder hidden states to be of the same d_model as the decoder
text_memory_resized = self.resizer(text_memory)
else:
# The text is already encoded, use as is.
text_attention_mask, text_memory_resized, tokenized = text
inputs_embeds = tokenized["inputs_embeds"]
assert input_boxes is None or len(input_boxes) == 0, "Can't replace boxes in text if it's already encoded"
# Note that the input_embeds are returned in pytorch's convention (sequence first)
return (
text_attention_mask,
text_memory_resized,
inputs_embeds.transpose(0, 1),
)
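The sketch below exercises the raw-string path with a stand-in tokenizer (hypothetical; the real model expects a CLIP-style BPE tokenizer callable as tokenizer(texts, context_length=...) that returns integer IDs with 0 as padding). Weights are random, so only the returned shapes and mask semantics are illustrative.

```python
import torch

from ultralytics.models.sam.sam3.text_encoder_ve import VETextEncoder

def dummy_tokenizer(texts, context_length=32):
    """Stand-in tokenizer: character codes, padded with 0 to context_length."""
    ids = torch.zeros(len(texts), context_length, dtype=torch.long)
    for i, t in enumerate(texts):
        codes = [min(ord(c), 255) for c in t][:context_length]
        ids[i, : len(codes)] = torch.tensor(codes)
    return ids

encoder = VETextEncoder(d_model=256, tokenizer=dummy_tokenizer, width=1024, heads=16,
                        layers=2, context_length=32, vocab_size=256)
mask, memory, embeds = encoder(["a photo of a cat", "a dog"])
print(mask.shape)    # torch.Size([2, 32])       True marks padding positions (mask is inverted)
print(memory.shape)  # torch.Size([32, 2, 256])  resized tokens, sequence-first
print(embeds.shape)  # torch.Size([32, 2, 1024]) raw token embeddings, sequence-first
```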
function ultralytics.models.sam.sam3.text_encoder_ve.text_global_pool
def text_global_pool(
x: torch.Tensor, text: torch.Tensor = None, pool_type: str = "argmax"
) -> tuple[torch.Tensor, torch.Tensor]
Extract pooled representation and tokens from text embeddings using specified pooling strategy
(first/last/argmax/none).
Args
| Name | Type | Description | Default |
|---|---|---|---|
| x | torch.Tensor | Token embeddings of shape (batch, seq, width). | required |
| text | torch.Tensor | Token IDs; required for "argmax" pooling (EOT position). | None |
| pool_type | str | One of "first", "last", "argmax", or "none". | "argmax" |
Source code in ultralytics/models/sam/sam3/text_encoder_ve.py
def text_global_pool(
x: torch.Tensor, text: torch.Tensor = None, pool_type: str = "argmax"
) -> tuple[torch.Tensor, torch.Tensor]:
"""Extract pooled representation and tokens from text embeddings using specified pooling strategy
(first/last/argmax/none).
"""
if pool_type == "first":
pooled, tokens = x[:, 0], x[:, 1:]
elif pool_type == "last":
pooled, tokens = x[:, -1], x[:, :-1]
elif pool_type == "argmax":
# take features from the eot embedding (eot_token is the highest number in each sequence)
assert text is not None
pooled, tokens = x[torch.arange(x.shape[0]), text.argmax(dim=-1)], x
else:
pooled = tokens = x
return pooled, tokens
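A small sketch of the "argmax" path, assuming the documented import path: the pooled vector is taken at the position of the highest token ID in each row, which is the EOT token in CLIP-style vocabularies.

```python
import torch

from ultralytics.models.sam.sam3.text_encoder_ve import text_global_pool

x = torch.randn(2, 5, 8)                    # (batch, seq, width) token features
text = torch.tensor([[1, 7, 9, 49407, 0],   # EOT (49407) at index 3
                     [1, 5, 49407, 0, 0]])  # EOT at index 2
pooled, tokens = text_global_pool(x, text, pool_type="argmax")
assert torch.equal(pooled[0], x[0, 3]) and torch.equal(pooled[1], x[1, 2])
print(pooled.shape, tokens.shape)           # torch.Size([2, 8]) torch.Size([2, 5, 8])
```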