सामग्री पर जाएं

के लिए संदर्भ ultralytics/nn/modules/transformer.py

नोट

यह फ़ाइल यहाँ उपलब्ध है: https://github.com/ultralytics/ultralytics/blob/main/ultralytics/nn/modules/transformer.py. यदि आपको कोई समस्या दिखे, तो कृपया पुल रिक्वेस्ट (Pull Request) भेजकर इसे ठीक करने में मदद करें 🛠️। धन्यवाद 🙏!



ultralytics.nn.modules.transformer.TransformerEncoderLayer

का रूप: Module

ट्रांसफार्मर एनकोडर की एक परत को परिभाषित करता है।

में स्रोत कोड ultralytics/nn/modules/transformer.py
28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 5354555657 58596061 62 63 64 65 66 67 68 69 70 71 72 73 747576777879808182 83
class TransformerEncoderLayer(nn.Module):
    """
    One transformer encoder layer: multi-head self-attention followed by a two-layer feed-forward network.

    Both sub-blocks are wrapped in a residual connection with dropout. LayerNorm is applied either after each
    residual sum (post-norm) or before each sub-block (pre-norm), selected by `normalize_before`. The attention
    module is created with `batch_first=True`.
    """

    def __init__(self, c1, cm=2048, num_heads=8, dropout=0.0, act=nn.GELU(), normalize_before=False):
        """
        Create the attention, feed-forward, normalization and dropout sub-modules.

        Args:
            c1: Embedding width used by the attention module, both LayerNorms and the feed-forward input/output.
            cm: Hidden width of the feed-forward network.
            num_heads: Number of attention heads.
            dropout: Dropout probability passed to the attention module and to the three Dropout modules.
            act: Activation module applied between the two feed-forward linear layers.
            normalize_before: When True, forward() takes the pre-norm path; otherwise the post-norm path.

        Raises:
            ModuleNotFoundError: If the TORCH_1_9 flag is false (the message states torch>=1.9 is required).
        """
        super().__init__()
        from ...utils.torch_utils import TORCH_1_9

        if not TORCH_1_9:
            raise ModuleNotFoundError(
                "TransformerEncoderLayer() requires torch>=1.9 to use nn.MultiheadAttention(batch_first=True)."
            )

        # Self-attention over c1-wide embeddings
        self.ma = nn.MultiheadAttention(c1, num_heads, dropout=dropout, batch_first=True)

        # Feed-forward network: c1 -> cm -> c1
        self.fc1 = nn.Linear(c1, cm)
        self.fc2 = nn.Linear(cm, c1)

        # One LayerNorm per sub-block
        self.norm1 = nn.LayerNorm(c1)
        self.norm2 = nn.LayerNorm(c1)

        # dropout: inside the feed-forward; dropout1 / dropout2: on the attention / feed-forward residual branch
        self.dropout = nn.Dropout(dropout)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        self.act = act
        self.normalize_before = normalize_before

    @staticmethod
    def with_pos_embed(tensor, pos=None):
        """Return `tensor + pos`, or `tensor` itself when `pos` is None."""
        if pos is None:
            return tensor
        return tensor + pos

    def forward_post(self, src, src_mask=None, src_key_padding_mask=None, pos=None):
        """Run the layer with LayerNorm applied after each residual sum (post-norm)."""
        # Position information goes into queries and keys only; values use the raw input.
        queries = keys = self.with_pos_embed(src, pos)
        attn_out = self.ma(queries, keys, value=src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)[0]
        src = self.norm1(src + self.dropout1(attn_out))

        hidden = self.act(self.fc1(src))
        ffn_out = self.fc2(self.dropout(hidden))
        return self.norm2(src + self.dropout2(ffn_out))

    def forward_pre(self, src, src_mask=None, src_key_padding_mask=None, pos=None):
        """Run the layer with LayerNorm applied before each sub-block (pre-norm)."""
        normed = self.norm1(src)
        queries = keys = self.with_pos_embed(normed, pos)
        attn_out = self.ma(queries, keys, value=normed, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)[0]
        src = src + self.dropout1(attn_out)

        hidden = self.act(self.fc1(self.norm2(src)))
        ffn_out = self.fc2(self.dropout(hidden))
        return src + self.dropout2(ffn_out)

    def forward(self, src, src_mask=None, src_key_padding_mask=None, pos=None):
        """Dispatch to forward_pre or forward_post according to `normalize_before`."""
        layer_fn = self.forward_pre if self.normalize_before else self.forward_post
        return layer_fn(src, src_mask, src_key_padding_mask, pos)

__init__(c1, cm=2048, num_heads=8, dropout=0.0, act=nn.GELU(), normalize_before=False)

निर्दिष्ट मापदंडों के साथ TransformerEncoderLayer को प्रारंभ करें।

में स्रोत कोड ultralytics/nn/modules/transformer.py
def __init__(self, c1, cm=2048, num_heads=8, dropout=0.0, act=nn.GELU(), normalize_before=False):
    """
    Create the attention, feed-forward, normalization and dropout sub-modules of the encoder layer.

    Args:
        c1: Embedding width used by the attention module, both LayerNorms and the feed-forward input/output.
        cm: Hidden width of the feed-forward network.
        num_heads: Number of attention heads.
        dropout: Dropout probability passed to the attention module and to the three Dropout modules.
        act: Activation module stored as `self.act`.
        normalize_before: Flag stored as `self.normalize_before`.

    Raises:
        ModuleNotFoundError: If the TORCH_1_9 flag is false (the message states torch>=1.9 is required).
    """
    super().__init__()
    from ...utils.torch_utils import TORCH_1_9

    # batch_first=True is used below; the guard rejects torch builds flagged as too old for it.
    if not TORCH_1_9:
        raise ModuleNotFoundError(
            "TransformerEncoderLayer() requires torch>=1.9 to use nn.MultiheadAttention(batch_first=True)."
        )
    self.ma = nn.MultiheadAttention(c1, num_heads, dropout=dropout, batch_first=True)
    # Feed-forward network: c1 -> cm -> c1
    self.fc1 = nn.Linear(c1, cm)
    self.fc2 = nn.Linear(cm, c1)

    self.norm1 = nn.LayerNorm(c1)
    self.norm2 = nn.LayerNorm(c1)
    self.dropout = nn.Dropout(dropout)
    self.dropout1 = nn.Dropout(dropout)
    self.dropout2 = nn.Dropout(dropout)

    self.act = act
    self.normalize_before = normalize_before

forward(src, src_mask=None, src_key_padding_mask=None, pos=None)

इनपुट को एनकोडर मॉड्यूल के माध्यम से आगे (फ़ॉरवर्ड) प्रसारित करता है।

में स्रोत कोड ultralytics/nn/modules/transformer.py
def forward(self, src, src_mask=None, src_key_padding_mask=None, pos=None):
    """Dispatch to forward_pre or forward_post according to `self.normalize_before`."""
    layer_fn = self.forward_pre if self.normalize_before else self.forward_post
    return layer_fn(src, src_mask, src_key_padding_mask, pos)

forward_post(src, src_mask=None, src_key_padding_mask=None, pos=None)

सामान्यीकरण के बाद के साथ फॉरवर्ड पास करता है।

में स्रोत कोड ultralytics/nn/modules/transformer.py
def forward_post(self, src, src_mask=None, src_key_padding_mask=None, pos=None):
    """Run the layer with LayerNorm applied after each residual sum (post-norm)."""
    # Position information goes into queries and keys only; values use the raw input.
    queries = keys = self.with_pos_embed(src, pos)
    attn_out = self.ma(queries, keys, value=src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)[0]
    src = self.norm1(src + self.dropout1(attn_out))

    hidden = self.act(self.fc1(src))
    ffn_out = self.fc2(self.dropout(hidden))
    return self.norm2(src + self.dropout2(ffn_out))

forward_pre(src, src_mask=None, src_key_padding_mask=None, pos=None)

पूर्व-सामान्यीकरण के साथ आगे पास करता है।

में स्रोत कोड ultralytics/nn/modules/transformer.py
def forward_pre(self, src, src_mask=None, src_key_padding_mask=None, pos=None):
    """Run the layer with LayerNorm applied before each sub-block (pre-norm)."""
    normed = self.norm1(src)
    # Position information goes into queries and keys only; values use the normalized input.
    queries = keys = self.with_pos_embed(normed, pos)
    attn_out = self.ma(queries, keys, value=normed, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)[0]
    src = src + self.dropout1(attn_out)

    hidden = self.act(self.fc1(self.norm2(src)))
    ffn_out = self.fc2(self.dropout(hidden))
    return src + self.dropout2(ffn_out)

with_pos_embed(tensor, pos=None) staticmethod

स्थिति एम्बेडिंग जोड़ें tensor यदि प्रदान किया गया है।

में स्रोत कोड ultralytics/nn/modules/transformer.py
@staticmethod
def with_pos_embed(tensor, pos=None):
    """Add position embeddings to the tensor if provided."""
    return tensor if pos is None else tensor + pos



ultralytics.nn.modules.transformer.AIFI

का रूप: TransformerEncoderLayer

एआईएफआई ट्रांसफार्मर परत को परिभाषित करता है।

में स्रोत कोड ultralytics/nn/modules/transformer.py
86 87 88  89 90 91 92 93 94 95 96 97  98  99  100  101 102 103 104 105  106  107 108109  110 111 112 113 114 115 
class AIFI(TransformerEncoderLayer):
    """
    AIFI transformer layer: a TransformerEncoderLayer applied to a 4D feature map.

    forward() flattens the two trailing dimensions into a token sequence, adds a 2D sine-cosine position
    embedding, runs the parent encoder layer and restores the original 4D layout.
    """

    def __init__(self, c1, cm=2048, num_heads=8, dropout=0, act=nn.GELU(), normalize_before=False):
        """Pass every argument through unchanged to TransformerEncoderLayer.__init__."""
        super().__init__(c1, cm, num_heads, dropout, act, normalize_before)

    def forward(self, x):
        """Encode a [B, C, H, W] feature map and return a contiguous tensor viewed as [-1, C, H, W]."""
        channels, height, width = x.shape[1:]
        # [B, C, H, W] -> [B, H*W, C]
        tokens = x.flatten(2).permute(0, 2, 1)
        # The embedding is built in float32 on the default device, then matched to the input.
        pos_embed = self.build_2d_sincos_position_embedding(width, height, channels)
        pos_embed = pos_embed.to(device=x.device, dtype=x.dtype)
        encoded = super().forward(tokens, pos=pos_embed)
        # [B, H*W, C] -> [B, C, H, W]
        return encoded.permute(0, 2, 1).view([-1, channels, height, width]).contiguous()

    @staticmethod
    def build_2d_sincos_position_embedding(w, h, embed_dim=256, temperature=10000.0):
        """
        Build a fixed 2D sine-cosine position embedding of shape [1, w*h, embed_dim].

        The channel axis is the concatenation [sin(w), cos(w), sin(h), cos(h)], each part embed_dim // 4 wide.

        NOTE(review): the grids are created as (w, h) with indexing="ij" and flattened w-major, whereas
        forward() flattens the feature map h-major (x.flatten(2) on [B, C, H, W]). The two orderings line up
        only up to a swap of the two axes when h == w -- confirm whether non-square inputs are intended.
        """
        assert embed_dim % 4 == 0, "Embed dimension must be divisible by 4 for 2D sin-cos position embedding"
        quarter = embed_dim // 4
        # Frequencies 1 / temperature**(k / quarter) for k = 0 .. quarter - 1
        exponents = torch.arange(quarter, dtype=torch.float32) / quarter
        freqs = 1.0 / (temperature**exponents)

        axis_w = torch.arange(w, dtype=torch.float32)
        axis_h = torch.arange(h, dtype=torch.float32)
        grid_w, grid_h = torch.meshgrid(axis_w, axis_h, indexing="ij")

        # Outer product of every grid coordinate with every frequency: [w*h, quarter]
        phase_w = grid_w.flatten()[..., None] @ freqs[None]
        phase_h = grid_h.flatten()[..., None] @ freqs[None]

        parts = [torch.sin(phase_w), torch.cos(phase_w), torch.sin(phase_h), torch.cos(phase_h)]
        return torch.cat(parts, 1)[None]

__init__(c1, cm=2048, num_heads=8, dropout=0, act=nn.GELU(), normalize_before=False)

निर्दिष्ट मापदंडों के साथ एआईएफआई उदाहरण को इनिशियलाइज़ करें।

में स्रोत कोड ultralytics/nn/modules/transformer.py
def __init__(self, c1, cm=2048, num_heads=8, dropout=0, act=nn.GELU(), normalize_before=False):
    """Initialize the AIFI instance by passing every argument through unchanged to the parent initializer."""
    super().__init__(c1, cm, num_heads, dropout, act, normalize_before)

build_2d_sincos_position_embedding(w, h, embed_dim=256, temperature=10000.0) staticmethod

2D साइन-कोसाइन स्थिति एम्बेडिंग बनाता है।

में स्रोत कोड ultralytics/nn/modules/transformer.py
101 102 103 104 105 106 107 108 109 110 111 112 113114115
@staticmethod
def build_2d_sincos_position_embedding(w, h, embed_dim=256, temperature=10000.0):
    """Builds 2D sine-cosine position embedding."""
    assert embed_dim % 4 == 0, "Embed dimension must be divisible by 4 for 2D sin-cos position embedding"
    grid_w = torch.arange(w, dtype=torch.float32)
    grid_h = torch.arange(h, dtype=torch.float32)
    grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing="ij")
    pos_dim = embed_dim // 4
    omega = torch.arange(pos_dim, dtype=torch.float32) / pos_dim
    omega = 1.0 / (temperature**omega)

    out_w = grid_w.flatten()[..., None] @ omega[None]
    out_h = grid_h.flatten()[..., None] @ omega[None]

    return torch.cat([torch.sin(out_w), torch.cos(out_w), torch.sin(out_h), torch.cos(out_h)], 1)[None]

forward(x)

एआईएफआई ट्रांसफार्मर परत के लिए फॉरवर्ड पास।

में स्रोत कोड ultralytics/nn/modules/transformer.py
def forward(self, x):
    """Encode a [B, C, H, W] feature map and return a contiguous tensor viewed as [-1, C, H, W]."""
    channels, height, width = x.shape[1:]
    # [B, C, H, W] -> [B, H*W, C]
    tokens = x.flatten(2).permute(0, 2, 1)
    # Match the freshly built embedding to the input's device and dtype.
    pos_embed = self.build_2d_sincos_position_embedding(width, height, channels)
    pos_embed = pos_embed.to(device=x.device, dtype=x.dtype)
    encoded = super().forward(tokens, pos=pos_embed)
    # [B, H*W, C] -> [B, C, H, W]
    return encoded.permute(0, 2, 1).view([-1, channels, height, width]).contiguous()



ultralytics.nn.modules.transformer.TransformerLayer

का रूप: Module

ट्रांसफार्मर परत https://arxiv.org/abs/2010.11929 (बेहतर प्रदर्शन के लिए लेयरनॉर्म परतों को हटा दिया गया)।

में स्रोत कोड ultralytics/nn/modules/transformer.py
118 119 120 121 122 123 124 125 126 127 128129 130131 132 133 134
class TransformerLayer(nn.Module):
    """Transformer layer https://arxiv.org/abs/2010.11929 (LayerNorm layers removed for better performance)."""

    def __init__(self, c, num_heads):
        """
        Create bias-free q/k/v projections, a multi-head attention module and two bias-free linear layers.

        Args:
            c: Embedding width; every linear layer maps c -> c.
            num_heads: Number of attention heads.
        """
        super().__init__()
        self.q = nn.Linear(c, c, bias=False)
        self.k = nn.Linear(c, c, bias=False)
        self.v = nn.Linear(c, c, bias=False)
        # batch_first is not set, so the attention module uses its default input layout
        self.ma = nn.MultiheadAttention(embed_dim=c, num_heads=num_heads)
        self.fc1 = nn.Linear(c, c, bias=False)
        self.fc2 = nn.Linear(c, c, bias=False)

    def forward(self, x):
        """Apply self-attention and then the two linear layers, each with a residual connection."""
        query, key, value = self.q(x), self.k(x), self.v(x)
        attended = self.ma(query, key, value)[0]
        x = attended + x
        projected = self.fc2(self.fc1(x))
        return projected + x

__init__(c, num_heads)

रैखिक परिवर्तनों और बहु-सिर ध्यान का उपयोग करके एक आत्म-ध्यान तंत्र को प्रारंभ करता है।

में स्रोत कोड ultralytics/nn/modules/transformer.py
121 122 123 124 125 126 127 128 129
def __init__(self, c, num_heads):
    """
    Create bias-free q/k/v projections, a multi-head attention module and two bias-free linear layers.

    Args:
        c: Embedding width; every linear layer maps c -> c.
        num_heads: Number of attention heads.
    """
    super().__init__()
    self.q = nn.Linear(c, c, bias=False)
    self.k = nn.Linear(c, c, bias=False)
    self.v = nn.Linear(c, c, bias=False)
    # batch_first is not set, so the attention module uses its default input layout
    self.ma = nn.MultiheadAttention(embed_dim=c, num_heads=num_heads)
    self.fc1 = nn.Linear(c, c, bias=False)
    self.fc2 = nn.Linear(c, c, bias=False)

forward(x)

इनपुट x पर एक ट्रांसफार्मर ब्लॉक लागू करें और आउटपुट वापस करें।

में स्रोत कोड ultralytics/nn/modules/transformer.py
131 132 133 134
def forward(self, x):
    """Apply self-attention and then the two linear layers, each with a residual connection."""
    query, key, value = self.q(x), self.k(x), self.v(x)
    attended = self.ma(query, key, value)[0]
    x = attended + x
    projected = self.fc2(self.fc1(x))
    return projected + x



ultralytics.nn.modules.transformer.TransformerBlock

का रूप: Module

दृष्टि ट्रांसफार्मर https://arxiv.org/abs/2010.11929।

में स्रोत कोड ultralytics/nn/modules/transformer.py
137 138 139 140 141 142 143 144 145 146 147 148149 150151 152 153 154 155 156
class TransformerBlock(nn.Module):
    """Vision Transformer https://arxiv.org/abs/2010.11929."""

    def __init__(self, c1, c2, num_heads, num_layers):
        """
        Build an optional channel-matching Conv, a linear position embedding and a stack of TransformerLayers.

        Args:
            c1: Number of input channels.
            c2: Number of channels the transformer stack works on (and the output channel count).
            num_heads: Number of attention heads in every TransformerLayer.
            num_layers: Number of TransformerLayer modules in the stack.
        """
        super().__init__()
        # A Conv is only needed when the channel count has to change.
        self.conv = Conv(c1, c2) if c1 != c2 else None
        self.linear = nn.Linear(c2, c2)  # learnable position embedding
        stack = [TransformerLayer(c2, num_heads) for _ in range(num_layers)]
        self.tr = nn.Sequential(*stack)
        self.c2 = c2

    def forward(self, x):
        """Flatten a 4D feature map into a sequence, run the transformer stack and restore the 4D layout."""
        if self.conv is not None:
            x = self.conv(x)
        # Only the order of the two trailing sizes matters: they are reused unchanged in the final reshape.
        batch, _, dim2, dim3 = x.shape
        # [b, c, d2, d3] -> [d2*d3, b, c]
        tokens = x.flatten(2).permute(2, 0, 1)
        tokens = self.tr(tokens + self.linear(tokens))
        return tokens.permute(1, 2, 0).reshape(batch, self.c2, dim2, dim3)

__init__(c1, c2, num_heads, num_layers)

स्थिति एम्बेडिंग और सिर और परतों की निर्दिष्ट संख्या के साथ एक ट्रांसफार्मर मॉड्यूल को प्रारंभ करें।

में स्रोत कोड ultralytics/nn/modules/transformer.py
140 141 142 143 144 145 146 147 148
def __init__(self, c1, c2, num_heads, num_layers):
    """
    Build an optional channel-matching Conv, a linear position embedding and a stack of TransformerLayers.

    Args:
        c1: Number of input channels.
        c2: Number of channels the transformer stack works on.
        num_heads: Number of attention heads in every TransformerLayer.
        num_layers: Number of TransformerLayer modules in the stack.
    """
    super().__init__()
    # The Conv is only created when the channel count has to change; otherwise conv stays None.
    self.conv = None
    if c1 != c2:
        self.conv = Conv(c1, c2)
    self.linear = nn.Linear(c2, c2)  # learnable position embedding
    self.tr = nn.Sequential(*(TransformerLayer(c2, num_heads) for _ in range(num_layers)))
    self.c2 = c2

forward(x)

इनपुट को बॉटलनेक (bottleneck) मॉड्यूल के माध्यम से आगे (फ़ॉरवर्ड) प्रसारित करता है।

में स्रोत कोड ultralytics/nn/modules/transformer.py
150 151 152 153 154 155  156
def forward(self, x):
    """Flatten a 4D feature map into a sequence, run the transformer stack and restore the 4D layout."""
    if self.conv is not None:
        x = self.conv(x)
    # Only the order of the two trailing sizes matters: they are reused unchanged in the final reshape.
    batch, _, dim2, dim3 = x.shape
    # [b, c, d2, d3] -> [d2*d3, b, c]
    tokens = x.flatten(2).permute(2, 0, 1)
    tokens = self.tr(tokens + self.linear(tokens))
    return tokens.permute(1, 2, 0).reshape(batch, self.c2, dim2, dim3)



ultralytics.nn.modules.transformer.MLPBlock

का रूप: Module

एक बहु-परत परसेप्ट्रॉन के एकल ब्लॉक को लागू करता है।

में स्रोत कोड ultralytics/nn/modules/transformer.py
159 160 161 162 163 164 165 166 167 168 169170171
class MLPBlock(nn.Module):
    """Two-layer perceptron block: linear -> activation -> linear, mapping back to the input width."""

    def __init__(self, embedding_dim, mlp_dim, act=nn.GELU):
        """
        Create the two linear layers and instantiate the activation.

        Args:
            embedding_dim: Input and output width.
            mlp_dim: Hidden width.
            act: Activation class (not an instance); it is called with no arguments to build the module.
        """
        super().__init__()
        self.lin1 = nn.Linear(embedding_dim, mlp_dim)
        self.lin2 = nn.Linear(mlp_dim, embedding_dim)
        self.act = act()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Expand to the hidden width, apply the activation and project back."""
        hidden = self.lin1(x)
        activated = self.act(hidden)
        return self.lin2(activated)

__init__(embedding_dim, mlp_dim, act=nn.GELU)

निर्दिष्ट एम्बेडिंग आयाम, MLP आयाम और सक्रियण फ़ंक्शन के साथ MLPBlock को प्रारंभ करें।

में स्रोत कोड ultralytics/nn/modules/transformer.py
162 163 164 165 166167
def __init__(self, embedding_dim, mlp_dim, act=nn.GELU):
    """
    Create the two linear layers and instantiate the activation.

    Args:
        embedding_dim: Input and output width.
        mlp_dim: Hidden width.
        act: Activation class (not an instance); it is called with no arguments to build the module.
    """
    super().__init__()
    self.lin1 = nn.Linear(embedding_dim, mlp_dim)
    self.lin2 = nn.Linear(mlp_dim, embedding_dim)
    self.act = act()

forward(x)

MLPBlock के लिए फॉरवर्ड पास।

में स्रोत कोड ultralytics/nn/modules/transformer.py
def forward(self, x: torch.Tensor) -> torch.Tensor:
    """Expand to the hidden width, apply the activation and project back."""
    hidden = self.lin1(x)
    activated = self.act(hidden)
    return self.lin2(activated)



ultralytics.nn.modules.transformer.MLP

का रूप: Module

एक साधारण बहु-परत परसेप्ट्रॉन (जिसे एफएफएन भी कहा जाता है) को लागू करता है।

में स्रोत कोड ultralytics/nn/modules/transformer.py
174 175 176 177 178 179 180 181 182 183 184 185 186 187 188
class MLP(nn.Module):
    """Simple multi-layer perceptron (also called FFN): linear layers with ReLU between them."""

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        """
        Create the chain of linear layers input_dim -> hidden_dim -> ... -> output_dim.

        Args:
            input_dim: Width of the input features.
            hidden_dim: Width of every intermediate layer.
            output_dim: Width of the final output.
            num_layers: Total number of linear layers.
        """
        super().__init__()
        self.num_layers = num_layers
        # Widths at every layer boundary; consecutive pairs give each layer's (in, out) sizes.
        widths = [input_dim, *([hidden_dim] * (num_layers - 1)), output_dim]
        self.layers = nn.ModuleList([nn.Linear(n_in, n_out) for n_in, n_out in zip(widths[:-1], widths[1:])])

    def forward(self, x):
        """Run all layers in order, applying ReLU after every layer except the last."""
        last = self.num_layers - 1
        for idx, layer in enumerate(self.layers):
            x = layer(x)
            if idx < last:
                x = F.relu(x)
        return x

__init__(input_dim, hidden_dim, output_dim, num_layers)

निर्दिष्ट इनपुट, छिपे हुए, आउटपुट आयामों और परतों की संख्या के साथ एमएलपी को इनिशियलाइज़ करें।

में स्रोत कोड ultralytics/nn/modules/transformer.py
def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
    """
    Create the chain of linear layers input_dim -> hidden_dim -> ... -> output_dim.

    Args:
        input_dim: Width of the input features.
        hidden_dim: Width of every intermediate layer.
        output_dim: Width of the final output.
        num_layers: Total number of linear layers.
    """
    super().__init__()
    self.num_layers = num_layers
    # Widths at every layer boundary; consecutive pairs give each layer's (in, out) sizes.
    widths = [input_dim, *([hidden_dim] * (num_layers - 1)), output_dim]
    self.layers = nn.ModuleList([nn.Linear(n_in, n_out) for n_in, n_out in zip(widths[:-1], widths[1:])])

forward(x)

पूरे एमएलपी के लिए फॉरवर्ड पास।

में स्रोत कोड ultralytics/nn/modules/transformer.py
184 185 186 187 188
def forward(self, x):
    """Run all layers in order, applying ReLU after every layer except the last."""
    last = self.num_layers - 1
    for idx, layer in enumerate(self.layers):
        x = layer(x)
        if idx < last:
            x = F.relu(x)
    return x



ultralytics.nn.modules.transformer.LayerNorm2d

का रूप: Module

Detectron2 और ConvNeXt कार्यान्वयन से प्रेरित 2D लेयर नॉर्मलाइज़ेशन मॉड्यूल।

में मूल कार्यान्वयन https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py और https://github.com/facebookresearch/ConvNeXt/blob/main/models/convnext.py।

में स्रोत कोड ultralytics/nn/modules/transformer.py
191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211212213
class LayerNorm2d(nn.Module):
    """
    Layer normalization over dimension 1 of the input, with a learnable per-channel scale and shift.

    Inspired by the Detectron2 and ConvNeXt implementations:
    https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py
    and
    https://github.com/facebookresearch/ConvNeXt/blob/main/models/convnext.py.
    """

    def __init__(self, num_channels, eps=1e-6):
        """
        Create the scale (ones) and shift (zeros) parameters.

        Args:
            num_channels: Length of the weight and bias vectors.
            eps: Constant added to the variance before the square root.
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(num_channels))
        self.bias = nn.Parameter(torch.zeros(num_channels))
        self.eps = eps

    def forward(self, x):
        """Normalize x along dimension 1, then apply the per-channel scale and shift."""
        mean = x.mean(1, keepdim=True)
        centered = x - mean
        # Biased variance: the mean of squared deviations along dimension 1
        variance = centered.pow(2).mean(1, keepdim=True)
        normalized = centered / torch.sqrt(variance + self.eps)
        # weight and bias get two trailing axes so they broadcast over the last two dimensions
        return self.weight[:, None, None] * normalized + self.bias[:, None, None]

__init__(num_channels, eps=1e-06)

दिए गए मापदंडों के साथ LayerNorm2d को इनिशियलाइज़ करें।

में स्रोत कोड ultralytics/nn/modules/transformer.py
201 202 203 204 205 206
def __init__(self, num_channels, eps=1e-6):
    """
    Create the scale (ones) and shift (zeros) parameters.

    Args:
        num_channels: Length of the weight and bias vectors.
        eps: Constant added to the variance before the square root.
    """
    super().__init__()
    self.weight = nn.Parameter(torch.ones(num_channels))
    self.bias = nn.Parameter(torch.zeros(num_channels))
    self.eps = eps

forward(x)

2D परत सामान्यीकरण के लिए फॉरवर्ड पास करें।

में स्रोत कोड ultralytics/nn/modules/transformer.py
208 209 210 211 212213
def forward(self, x):
    """Normalize x along dimension 1, then apply the per-channel scale and shift."""
    mean = x.mean(1, keepdim=True)
    centered = x - mean
    # Biased variance: the mean of squared deviations along dimension 1
    variance = centered.pow(2).mean(1, keepdim=True)
    normalized = centered / torch.sqrt(variance + self.eps)
    # weight and bias get two trailing axes so they broadcast over the last two dimensions
    return self.weight[:, None, None] * normalized + self.bias[:, None, None]



ultralytics.nn.modules.transformer.MSDeformAttn

का रूप: Module

डिफॉर्मबल-डीईटीआर और पैडलडिटेक्शन कार्यान्वयन के आधार पर मल्टी-स्केल डिफॉर्मेबल अटेंशन मॉड्यूल।

https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/ops/modules/ms_deform_attn.py

में स्रोत कोड ultralytics/nn/modules/transformer.py
216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297298299 300 301 302  303 304305306307
class MSDeformAttn(nn.Module):
    """
    Multi-Scale Deformable Attention Module based on Deformable-DETR and PaddleDetection implementations.

    https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/ops/modules/ms_deform_attn.py
    """

    def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4):
        """
        Create the offset, attention-weight, value and output projections and initialize them.

        Args:
            d_model: Embedding width of queries and values.
            n_levels: Number of feature levels sampled from.
            n_heads: Number of attention heads.
            n_points: Number of sampling points per head and level.

        Raises:
            ValueError: If d_model is not divisible by n_heads.
        """
        super().__init__()
        if d_model % n_heads != 0:
            raise ValueError(f"d_model must be divisible by n_heads, but got {d_model} and {n_heads}")
        # Better to set the per-head width (d_model // n_heads) to a power of 2, which is more efficient in a
        # CUDA implementation.

        self.im2col_step = 64

        self.d_model = d_model
        self.n_levels = n_levels
        self.n_heads = n_heads
        self.n_points = n_points

        # Two offset coordinates and one attention logit per (head, level, point)
        self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2)
        self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points)
        self.value_proj = nn.Linear(d_model, d_model)
        self.output_proj = nn.Linear(d_model, d_model)

        self._reset_parameters()

    def _reset_parameters(self):
        """
        Reset module parameters.

        Offset and attention weights are zeroed, so the initial sampling offsets come from the bias alone: each
        head gets a direction at angle 2*pi*head/n_heads (scaled so its largest component has magnitude 1) and
        the k-th sampling point is placed k + 1 steps along it. Value/output projections use Xavier-uniform
        weights and zero biases.
        """
        constant_(self.sampling_offsets.weight.data, 0.0)
        thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
        grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
        grid_init = (
            (grid_init / grid_init.abs().max(-1, keepdim=True)[0])
            .view(self.n_heads, 1, 1, 2)
            .repeat(1, self.n_levels, self.n_points, 1)
        )
        # Scale point i by (i + 1) along the points axis
        point_scale = torch.arange(1, self.n_points + 1, dtype=grid_init.dtype).view(1, 1, self.n_points, 1)
        grid_init = grid_init * point_scale
        with torch.no_grad():
            self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
        constant_(self.attention_weights.weight.data, 0.0)
        constant_(self.attention_weights.bias.data, 0.0)
        xavier_uniform_(self.value_proj.weight.data)
        constant_(self.value_proj.bias.data, 0.0)
        xavier_uniform_(self.output_proj.weight.data)
        constant_(self.output_proj.bias.data, 0.0)

    def forward(self, query, refer_bbox, value, value_shapes, value_mask=None):
        """
        Perform forward pass for multiscale deformable attention.

        https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py

        Args:
            query (torch.Tensor): [bs, query_length, C]
            refer_bbox (torch.Tensor): [bs, query_length, n_levels, 2] reference points, or
                [bs, query_length, n_levels, 4] reference boxes whose last two entries scale the offsets;
                range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area
            value (torch.Tensor): [bs, value_length, C]
            value_shapes (List): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
            value_mask (Tensor): [bs, value_length] boolean mask; projected values are set to zero wherever the
                mask is True (i.e. True marks the elements to ignore, such as padding)

        Returns:
            output (Tensor): [bs, Length_{query}, C]

        Raises:
            ValueError: If the last dimension of refer_bbox is neither 2 nor 4.
        """
        bs, len_q = query.shape[:2]
        len_v = value.shape[1]
        assert sum(s[0] * s[1] for s in value_shapes) == len_v

        value = self.value_proj(value)
        if value_mask is not None:
            # masked_fill writes 0 where the mask is True
            value = value.masked_fill(value_mask[..., None], float(0))
        value = value.view(bs, len_v, self.n_heads, self.d_model // self.n_heads)
        sampling_offsets = self.sampling_offsets(query).view(bs, len_q, self.n_heads, self.n_levels, self.n_points, 2)
        attention_weights = self.attention_weights(query).view(bs, len_q, self.n_heads, self.n_levels * self.n_points)
        # Softmax jointly over all levels and points of a head, then split the two axes again
        attention_weights = F.softmax(attention_weights, -1).view(bs, len_q, self.n_heads, self.n_levels, self.n_points)
        # N, Len_q, n_heads, n_levels, n_points, 2
        ref_dim = refer_bbox.shape[-1]
        if ref_dim == 2:
            # (H, W) -> (W, H) so offsets are divided by the matching level size
            offset_normalizer = torch.as_tensor(value_shapes, dtype=query.dtype, device=query.device).flip(-1)
            add = sampling_offsets / offset_normalizer[None, None, None, :, None, :]
            sampling_locations = refer_bbox[:, :, None, :, None, :] + add
        elif ref_dim == 4:
            # Offsets are scaled by half of the last two box entries and by 1 / n_points
            add = sampling_offsets / self.n_points * refer_bbox[:, :, None, :, None, 2:] * 0.5
            sampling_locations = refer_bbox[:, :, None, :, None, :2] + add
        else:
            raise ValueError(f"Last dim of refer_bbox must be 2 or 4, but got {ref_dim}.")
        output = multi_scale_deformable_attn_pytorch(value, value_shapes, sampling_locations, attention_weights)
        return self.output_proj(output)

__init__(d_model=256, n_levels=4, n_heads=8, n_points=4)

दिए गए मापदंडों के साथ MSDeformAttn को प्रारंभ करें।

में स्रोत कोड ultralytics/nn/modules/transformer.py
223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238239 240241 242 243244
def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4):
    """
    Create the offset, attention-weight, value and output projections and initialize them.

    Args:
        d_model: Embedding width of queries and values.
        n_levels: Number of feature levels sampled from.
        n_heads: Number of attention heads.
        n_points: Number of sampling points per head and level.

    Raises:
        ValueError: If d_model is not divisible by n_heads.
    """
    super().__init__()
    if d_model % n_heads != 0:
        raise ValueError(f"d_model must be divisible by n_heads, but got {d_model} and {n_heads}")
    # Better to set the per-head width (d_model // n_heads) to a power of 2, which is more efficient in a
    # CUDA implementation.

    self.im2col_step = 64

    self.d_model = d_model
    self.n_levels = n_levels
    self.n_heads = n_heads
    self.n_points = n_points

    # Two offset coordinates and one attention logit per (head, level, point)
    self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2)
    self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points)
    self.value_proj = nn.Linear(d_model, d_model)
    self.output_proj = nn.Linear(d_model, d_model)

    self._reset_parameters()

forward(query, refer_bbox, value, value_shapes, value_mask=None)

मल्टीस्केल विकृत ध्यान के लिए फॉरवर्ड पास करें।

https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py

पैरामीटर:

नाम | प्रकार | विवरण | डिफ़ॉल्ट
query Tensor

[bs, query_length, C]

आवश्यक
refer_bbox Tensor

[bs, query_length, n_levels, 2], मान [0, 1] की सीमा में, ऊपरी-बाएँ (0, 0), निचले-दाएँ (1, 1), पैडिंग क्षेत्र सहित

आवश्यक
value Tensor

[bs, value_length, C]

आवश्यक
value_shapes List

[n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]

आवश्यक
value_mask Tensor

[bs, value_length], गैर-पैडिंग तत्वों के लिए सत्य, पैडिंग तत्वों के लिए गलत

None

रिटर्न:

नाम | प्रकार | विवरण
output Tensor

[bs, Length_{query}, C]

में स्रोत कोड ultralytics/nn/modules/transformer.py
267 268 269 270 271 272 273 274 275 276 277 278279 280 281 282 283 284 285 286 287 288 289290 291 292 293 294 295 296 297 298299 300 301 302 303 304 305 306 307
def forward(self, query, refer_bbox, value, value_shapes, value_mask=None):
    """
    Perform forward pass for multiscale deformable attention.

    https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py

    Args:
        query (torch.Tensor): [bs, query_length, C]
        refer_bbox (torch.Tensor): [bs, query_length, n_levels, 2] reference points, or
            [bs, query_length, n_levels, 4] reference boxes whose last two entries scale the offsets;
            range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area
        value (torch.Tensor): [bs, value_length, C]
        value_shapes (List): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
        value_mask (Tensor): [bs, value_length] boolean mask; projected values are set to zero wherever the
            mask is True (i.e. True marks the elements to ignore, such as padding)

    Returns:
        output (Tensor): [bs, Length_{query}, C]

    Raises:
        ValueError: If the last dimension of refer_bbox is neither 2 nor 4.
    """
    bs, len_q = query.shape[:2]
    len_v = value.shape[1]
    assert sum(s[0] * s[1] for s in value_shapes) == len_v

    value = self.value_proj(value)
    if value_mask is not None:
        # masked_fill writes 0 where the mask is True
        value = value.masked_fill(value_mask[..., None], float(0))
    value = value.view(bs, len_v, self.n_heads, self.d_model // self.n_heads)
    sampling_offsets = self.sampling_offsets(query).view(bs, len_q, self.n_heads, self.n_levels, self.n_points, 2)
    attention_weights = self.attention_weights(query).view(bs, len_q, self.n_heads, self.n_levels * self.n_points)
    # Softmax jointly over all levels and points of a head, then split the two axes again
    attention_weights = F.softmax(attention_weights, -1).view(bs, len_q, self.n_heads, self.n_levels, self.n_points)
    # N, Len_q, n_heads, n_levels, n_points, 2
    ref_dim = refer_bbox.shape[-1]
    if ref_dim == 2:
        # (H, W) -> (W, H) so offsets are divided by the matching level size
        offset_normalizer = torch.as_tensor(value_shapes, dtype=query.dtype, device=query.device).flip(-1)
        add = sampling_offsets / offset_normalizer[None, None, None, :, None, :]
        sampling_locations = refer_bbox[:, :, None, :, None, :] + add
    elif ref_dim == 4:
        # Offsets are scaled by half of the last two box entries and by 1 / n_points
        add = sampling_offsets / self.n_points * refer_bbox[:, :, None, :, None, 2:] * 0.5
        sampling_locations = refer_bbox[:, :, None, :, None, :2] + add
    else:
        raise ValueError(f"Last dim of refer_bbox must be 2 or 4, but got {ref_dim}.")
    output = multi_scale_deformable_attn_pytorch(value, value_shapes, sampling_locations, attention_weights)
    return self.output_proj(output)



ultralytics.nn.modules.transformer.DeformableTransformerDecoderLayer

का रूप: Module

विकृत ट्रांसफार्मर डिकोडर परत पैडलडिटेक्शन और विरूपक-डीईटीआर कार्यान्वयन से प्रेरित है।

https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/deformable_transformer.py

में स्रोत कोड ultralytics/nn/modules/transformer.py
310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338339 340 341342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370
class DeformableTransformerDecoderLayer(nn.Module):
    """
    One decoder layer: query self-attention, multi-scale deformable cross-attention, then a feed-forward network.

    The design follows the PaddleDetection and Deformable-DETR implementations:
    https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
    https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/deformable_transformer.py

    For every sub-block the output is passed through dropout, added back to the sub-block input and the sum is
    layer-normalized.
    """

    def __init__(self, d_model=256, n_heads=8, d_ffn=1024, dropout=0.0, act=nn.ReLU(), n_levels=4, n_points=4):
        """
        Build the attention, feed-forward, dropout and normalization submodules.

        Args:
            d_model (int): Feature width used by both attention blocks, the FFN input/output and every LayerNorm.
            n_heads (int): Head count given to the self-attention module and to MSDeformAttn.
            d_ffn (int): Hidden width of the feed-forward network.
            dropout (float): Probability used by the self-attention module and by all four nn.Dropout layers.
            act (nn.Module): Activation placed between the two FFN linear layers.
            n_levels (int): Forwarded to MSDeformAttn.
            n_points (int): Forwarded to MSDeformAttn.
        """
        super().__init__()

        # --- self-attention sub-block (sequence-first module: no batch_first is requested) ---
        self.self_attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout)
        self.dropout1 = nn.Dropout(dropout)
        self.norm1 = nn.LayerNorm(d_model)

        # --- deformable cross-attention sub-block ---
        self.cross_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)
        self.dropout2 = nn.Dropout(dropout)
        self.norm2 = nn.LayerNorm(d_model)

        # --- feed-forward sub-block ---
        self.linear1 = nn.Linear(d_model, d_ffn)
        self.act = act
        self.dropout3 = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_ffn, d_model)
        self.dropout4 = nn.Dropout(dropout)
        self.norm3 = nn.LayerNorm(d_model)

    @staticmethod
    def with_pos_embed(tensor, pos):
        """Return `tensor + pos`, or `tensor` itself when `pos` is None."""
        if pos is None:
            return tensor
        return tensor + pos

    def forward_ffn(self, tgt):
        """Run the feed-forward sub-block on `tgt` and return the layer-normalized residual sum."""
        hidden = self.act(self.linear1(tgt))
        ffn_out = self.linear2(self.dropout3(hidden))
        return self.norm3(tgt + self.dropout4(ffn_out))

    def forward(self, embed, refer_bbox, feats, shapes, padding_mask=None, attn_mask=None, query_pos=None):
        """
        Apply self-attention, deformable cross-attention and the FFN to the query embeddings.

        Args:
            embed (torch.Tensor): Query embeddings; dims 0 and 1 are swapped before self-attention, so the input
                is presumably batch-first (bs, num_queries, d_model) - confirm against the caller.
            refer_bbox (torch.Tensor): Reference boxes/points; an extra axis is inserted at dim 2 before they are
                handed to the cross-attention module.
            feats: Value features forwarded unchanged to the cross-attention module.
            shapes: Feature-map shapes forwarded unchanged to the cross-attention module.
            padding_mask: Optional mask forwarded to the cross-attention module.
            attn_mask: Optional attention mask for the self-attention module.
            query_pos: Optional positional embedding added to the queries (and self-attention keys).

        Returns:
            (torch.Tensor): Output of the final feed-forward sub-block.
        """
        # Self-attention: queries and keys carry the positional embedding, values do not.
        queries = keys = self.with_pos_embed(embed, query_pos)
        attn_result = self.self_attn(
            queries.transpose(0, 1),
            keys.transpose(0, 1),
            embed.transpose(0, 1),
            attn_mask=attn_mask,
        )
        self_out = attn_result[0].transpose(0, 1)  # first element is the attention output
        embed = self.norm1(embed + self.dropout1(self_out))

        # Cross-attention against the image features.
        cross_query = self.with_pos_embed(embed, query_pos)
        cross_out = self.cross_attn(cross_query, refer_bbox.unsqueeze(2), feats, shapes, padding_mask)
        embed = self.norm2(embed + self.dropout2(cross_out))

        # Feed-forward network (residual + norm handled inside).
        return self.forward_ffn(embed)

__init__(d_model=256, n_heads=8, d_ffn=1024, dropout=0.0, act=nn.ReLU(), n_levels=4, n_points=4)

दिए गए मापदंडों के साथ DeformableTransformerDecoderLayer को इनिशियलाइज़ करें।

में स्रोत कोड ultralytics/nn/modules/transformer.py
318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336337338 
def __init__(self, d_model=256, n_heads=8, d_ffn=1024, dropout=0.0, act=nn.ReLU(), n_levels=4, n_points=4):
    """
    Initialize the DeformableTransformerDecoderLayer with the given parameters.

    Args:
        d_model (int): Feature width used by both attention blocks, the FFN input/output and every LayerNorm.
        n_heads (int): Head count given to the self-attention module and to MSDeformAttn.
        d_ffn (int): Hidden width of the feed-forward network.
        dropout (float): Probability used by the self-attention module and by all four nn.Dropout layers.
        act (nn.Module): Activation stored as `self.act` for the feed-forward network. NOTE(review): the default
            instance is created once, when the function is defined, so it is the same object for every layer
            built with the default.
        n_levels (int): Forwarded to MSDeformAttn; presumably the number of feature levels - confirm there.
        n_points (int): Forwarded to MSDeformAttn; presumably sampling points per level - confirm there.
    """
    super().__init__()

    # Self attention (constructed without batch_first, i.e. with PyTorch's sequence-first default)
    self.self_attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout)
    self.dropout1 = nn.Dropout(dropout)
    self.norm1 = nn.LayerNorm(d_model)

    # Cross attention (multi-scale deformable attention; note it does not receive `dropout`)
    self.cross_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)
    self.dropout2 = nn.Dropout(dropout)
    self.norm2 = nn.LayerNorm(d_model)

    # FFN: d_model -> d_ffn -> d_model, with its own dropouts and final LayerNorm
    self.linear1 = nn.Linear(d_model, d_ffn)
    self.act = act
    self.dropout3 = nn.Dropout(dropout)
    self.linear2 = nn.Linear(d_ffn, d_model)
    self.dropout4 = nn.Dropout(dropout)
    self.norm3 = nn.LayerNorm(d_model)

forward(embed, refer_bbox, feats, shapes, padding_mask=None, attn_mask=None, query_pos=None)

पूरी डिकोडर परत के माध्यम से फ़ॉरवर्ड पास करें।

में स्रोत कोड ultralytics/nn/modules/transformer.py
351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368369370
def forward(self, embed, refer_bbox, feats, shapes, padding_mask=None, attn_mask=None, query_pos=None):
    """
    Apply self-attention, deformable cross-attention and the FFN to the query embeddings.

    Args:
        embed (torch.Tensor): Query embeddings; dims 0 and 1 are swapped before self-attention, so the input is
            presumably batch-first (bs, num_queries, d_model) - confirm against the caller.
        refer_bbox (torch.Tensor): Reference boxes/points; an extra axis is inserted at dim 2 before they are
            handed to the cross-attention module.
        feats: Value features forwarded unchanged to the cross-attention module.
        shapes: Feature-map shapes forwarded unchanged to the cross-attention module.
        padding_mask: Optional mask forwarded to the cross-attention module.
        attn_mask: Optional attention mask for the self-attention module.
        query_pos: Optional positional embedding added to the queries (and self-attention keys).

    Returns:
        (torch.Tensor): Output of the final feed-forward sub-block.
    """
    # Self-attention: queries and keys carry the positional embedding, values do not.
    queries = keys = self.with_pos_embed(embed, query_pos)
    attn_result = self.self_attn(
        queries.transpose(0, 1),
        keys.transpose(0, 1),
        embed.transpose(0, 1),
        attn_mask=attn_mask,
    )
    self_out = attn_result[0].transpose(0, 1)  # first element is the attention output
    embed = self.norm1(embed + self.dropout1(self_out))

    # Cross-attention against the image features.
    cross_query = self.with_pos_embed(embed, query_pos)
    cross_out = self.cross_attn(cross_query, refer_bbox.unsqueeze(2), feats, shapes, padding_mask)
    embed = self.norm2(embed + self.dropout2(cross_out))

    # Feed-forward network (residual + norm handled inside).
    return self.forward_ffn(embed)

forward_ffn(tgt)

परत के फीड-फॉरवर्ड नेटवर्क भाग के माध्यम से फ़ॉरवर्ड पास करें।

में स्रोत कोड ultralytics/nn/modules/transformer.py
345 346 347 348 349
def forward_ffn(self, tgt):
    """
    Run the feed-forward sub-block on `tgt`.

    The input goes through linear1 -> activation -> dropout3 -> linear2; the result is passed through dropout4,
    added back to `tgt`, and the sum is normalized by norm3.
    """
    hidden = self.act(self.linear1(tgt))
    ffn_out = self.linear2(self.dropout3(hidden))
    return self.norm3(tgt + self.dropout4(ffn_out))

with_pos_embed(tensor, pos) staticmethod

यदि स्थितीय एम्बेडिंग प्रदान की गई हों, तो उन्हें इनपुट tensor में जोड़ें।

में स्रोत कोड ultralytics/nn/modules/transformer.py
340 341 342 343
@staticmethod
def with_pos_embed(tensor, pos):
    """Return `tensor + pos`, or `tensor` itself (unchanged) when `pos` is None."""
    if pos is None:
        return tensor
    return tensor + pos



ultralytics.nn.modules.transformer.DeformableTransformerDecoder

का रूप: Module

PaddleDetection पर आधारित Deformable ट्रांसफार्मर डिकोडर का कार्यान्वयन।

https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py

में स्रोत कोड ultralytics/nn/modules/transformer.py
373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397398399400 401 402403404405 406 407 408 409 410 411 412 413 414 415 416 417 418419 420 421 422 423 424 425 426
class DeformableTransformerDecoder(nn.Module):
    """
    Stack of deformable decoder layers with per-layer box refinement, based on PaddleDetection.

    https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
    """

    def __init__(self, hidden_dim, decoder_layer, num_layers, eval_idx=-1):
        """
        Store the layer stack and resolve which layer's output is returned outside of training.

        Args:
            hidden_dim (int): Stored as `self.hidden_dim`; not otherwise used in this class.
            decoder_layer (nn.Module): Prototype layer handed to `_get_clones` together with `num_layers`.
            num_layers (int): Number of decoder layers.
            eval_idx (int): Index of the layer whose predictions are returned when not training. A negative
                value is offset by `num_layers`, so -1 selects the last layer.
        """
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.layers = _get_clones(decoder_layer, num_layers)
        if eval_idx < 0:
            # Negative indices count back from the end of the stack.
            eval_idx = num_layers + eval_idx
        self.eval_idx = eval_idx

    def forward(
        self,
        embed,
        refer_bbox,
        feats,
        shapes,
        bbox_head,
        score_head,
        pos_mlp,
        attn_mask=None,
        padding_mask=None,
    ):
        """
        Run the decoder layers, refining the reference boxes after every layer.

        Args:
            embed: Decoder embeddings fed to the first layer.
            refer_bbox: Anchor boxes before activation; a sigmoid is applied here before the first layer.
            feats: Image features, forwarded unchanged to every layer.
            shapes: Feature shapes, forwarded unchanged to every layer.
            bbox_head: Sequence indexed by layer; `bbox_head[i]` maps the layer output to a box delta.
            score_head: Sequence indexed by layer; `score_head[i]` maps the layer output to class scores.
            pos_mlp: Callable that turns the current reference boxes into the layers' `query_pos`.
            attn_mask: Optional self-attention mask forwarded to every layer.
            padding_mask: Optional mask forwarded to every layer.

        Returns:
            (tuple): `(boxes, scores)`, each a `torch.stack` of the collected per-layer tensors. During training
                one entry per layer is collected; otherwise only the `eval_idx` layer is collected and the loop
                stops there.
        """
        hidden = embed
        boxes_per_layer, scores_per_layer = [], []
        prev_refined = None
        refer_bbox = refer_bbox.sigmoid()

        for idx, layer in enumerate(self.layers):
            query_pos = pos_mlp(refer_bbox)
            hidden = layer(hidden, refer_bbox, feats, shapes, padding_mask, attn_mask, query_pos)

            # The head predicts a delta that is applied in inverse-sigmoid space.
            delta = bbox_head[idx](hidden)
            refined = torch.sigmoid(delta + inverse_sigmoid(refer_bbox))

            if self.training:
                scores_per_layer.append(score_head[idx](hidden))
                if idx == 0:
                    boxes_per_layer.append(refined)
                else:
                    # Later layers apply the delta to the previous layer's refined box, which (unlike
                    # `refer_bbox` in training) has not been detached.
                    boxes_per_layer.append(torch.sigmoid(delta + inverse_sigmoid(prev_refined)))
            elif idx == self.eval_idx:
                scores_per_layer.append(score_head[idx](hidden))
                boxes_per_layer.append(refined)
                break

            prev_refined = refined
            if self.training:
                refer_bbox = refined.detach()
            else:
                refer_bbox = refined

        return torch.stack(boxes_per_layer), torch.stack(scores_per_layer)

__init__(hidden_dim, decoder_layer, num_layers, eval_idx=-1)

दिए गए मापदंडों के साथ DeformableTransformerDecoder को प्रारंभ करें।

में स्रोत कोड ultralytics/nn/modules/transformer.py
380 381 382 383 384385 386
def __init__(self, hidden_dim, decoder_layer, num_layers, eval_idx=-1):
    """
    Store the layer stack and resolve which layer's output is returned outside of training.

    Args:
        hidden_dim (int): Stored as `self.hidden_dim`.
        decoder_layer (nn.Module): Prototype layer handed to `_get_clones` together with `num_layers`.
        num_layers (int): Number of decoder layers.
        eval_idx (int): Index of the layer whose predictions are returned when not training. A negative value
            is offset by `num_layers`, so -1 selects the last layer.
    """
    super().__init__()
    self.hidden_dim = hidden_dim
    self.num_layers = num_layers
    self.layers = _get_clones(decoder_layer, num_layers)
    if eval_idx < 0:
        # Negative indices count back from the end of the stack.
        eval_idx = num_layers + eval_idx
    self.eval_idx = eval_idx

forward(embed, refer_bbox, feats, shapes, bbox_head, score_head, pos_mlp, attn_mask=None, padding_mask=None)

पूरे डिकोडर के माध्यम से फ़ॉरवर्ड पास करें।

में स्रोत कोड ultralytics/nn/modules/transformer.py
388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415416 417 418419420421 422 423 424 425426
def forward(
    self,
    embed,
    refer_bbox,
    feats,
    shapes,
    bbox_head,
    score_head,
    pos_mlp,
    attn_mask=None,
    padding_mask=None,
):
    """
    Run the decoder layers, refining the reference boxes after every layer.

    Args:
        embed: Decoder embeddings fed to the first layer.
        refer_bbox: Anchor boxes before activation; a sigmoid is applied here before the first layer.
        feats: Image features, forwarded unchanged to every layer.
        shapes: Feature shapes, forwarded unchanged to every layer.
        bbox_head: Sequence indexed by layer; `bbox_head[i]` maps the layer output to a box delta.
        score_head: Sequence indexed by layer; `score_head[i]` maps the layer output to class scores.
        pos_mlp: Callable that turns the current reference boxes into the layers' `query_pos`.
        attn_mask: Optional self-attention mask forwarded to every layer.
        padding_mask: Optional mask forwarded to every layer.

    Returns:
        (tuple): `(boxes, scores)`, each a `torch.stack` of the collected per-layer tensors. During training one
            entry per layer is collected; otherwise only the `eval_idx` layer is collected and the loop stops
            there.
    """
    hidden = embed
    boxes_per_layer, scores_per_layer = [], []
    prev_refined = None
    refer_bbox = refer_bbox.sigmoid()

    for idx, layer in enumerate(self.layers):
        query_pos = pos_mlp(refer_bbox)
        hidden = layer(hidden, refer_bbox, feats, shapes, padding_mask, attn_mask, query_pos)

        # The head predicts a delta that is applied in inverse-sigmoid space.
        delta = bbox_head[idx](hidden)
        refined = torch.sigmoid(delta + inverse_sigmoid(refer_bbox))

        if self.training:
            scores_per_layer.append(score_head[idx](hidden))
            if idx == 0:
                boxes_per_layer.append(refined)
            else:
                # Later layers apply the delta to the previous layer's refined box, which (unlike
                # `refer_bbox` in training) has not been detached.
                boxes_per_layer.append(torch.sigmoid(delta + inverse_sigmoid(prev_refined)))
        elif idx == self.eval_idx:
            scores_per_layer.append(score_head[idx](hidden))
            boxes_per_layer.append(refined)
            break

        prev_refined = refined
        if self.training:
            refer_bbox = refined.detach()
        else:
            refer_bbox = refined

    return torch.stack(boxes_per_layer), torch.stack(scores_per_layer)





2023-11-12 बनाया गया, अपडेट किया गया 2023-11-25
लेखक: glenn-jocher (3)