Change the AMPLIFY API to be compatible with AMPLIFY v1
Browse files
src --> input_ids
pad_mask --> attention_mask
- amplify.py +15 -15
amplify.py
CHANGED
|
@@ -134,13 +134,13 @@ class EncoderBlock(nn.Module):
|
|
| 134 |
|
| 135 |
self.ffn_dropout = nn.Dropout(config.dropout_prob)
|
| 136 |
|
| 137 |
-
def forward(self, x: torch.Tensor,
|
| 138 |
-
attn, contact = self._att_block(self.attention_norm(x),
|
| 139 |
x = x + attn
|
| 140 |
x = x + self._ff_block(self.ffn_norm(x))
|
| 141 |
return x, contact
|
| 142 |
|
| 143 |
-
def _att_block(self, x: torch.Tensor,
|
| 144 |
batch_size, seq_len, _ = x.shape
|
| 145 |
xq, xk, xv = self.q(x), self.k(x), self.v(x)
|
| 146 |
|
|
@@ -154,8 +154,8 @@ class EncoderBlock(nn.Module):
|
|
| 154 |
attn_weights = None
|
| 155 |
if output_attentions:
|
| 156 |
attn_weights = xq.permute(0, 2, 1, 3) @ xk.permute(0, 2, 3, 1) / (xq.size(-1) ** 0.5)
|
| 157 |
-
if
|
| 158 |
-
attn_weights = attn_weights +
|
| 159 |
attn_weights = attn_weights.softmax(-1)
|
| 160 |
|
| 161 |
# Compute the attention using xformers if the tensors are on GPU
|
|
@@ -166,7 +166,7 @@ class EncoderBlock(nn.Module):
|
|
| 166 |
query=xq,
|
| 167 |
key=xk,
|
| 168 |
value=xv,
|
| 169 |
-
attn_bias=
|
| 170 |
p=self.config.dropout_prob if self.training else 0,
|
| 171 |
)
|
| 172 |
else:
|
|
@@ -175,7 +175,7 @@ class EncoderBlock(nn.Module):
|
|
| 175 |
query=xq.transpose(1, 2),
|
| 176 |
key=xk.transpose(1, 2),
|
| 177 |
value=xv.transpose(1, 2),
|
| 178 |
-
attn_mask=
|
| 179 |
dropout_p=self.config.dropout_prob if self.training else 0,
|
| 180 |
).transpose(1, 2)
|
| 181 |
|
|
@@ -249,27 +249,27 @@ class AMPLIFY(AMPLIFYPreTrainedModel):
|
|
| 249 |
return model, tokenizer
|
| 250 |
|
| 251 |
|
| 252 |
-
def forward(self,
|
| 253 |
# Initialize
|
| 254 |
hidden_states, attentions = [], []
|
| 255 |
|
| 256 |
# Expand and repeat: (Batch, Length) -> (Batch, Heads, Length, Length)
|
| 257 |
-
if
|
| 258 |
-
assert
|
| 259 |
-
|
| 260 |
|
| 261 |
# RoPE
|
| 262 |
-
self.freqs_cis = self.freqs_cis.to(
|
| 263 |
-
freqs_cis = self.freqs_cis[:
|
| 264 |
|
| 265 |
# Embedding
|
| 266 |
-
x = self.encoder(
|
| 267 |
if self.config.layer_norm_after_embedding:
|
| 268 |
x = self.layer_norm_1(x)
|
| 269 |
|
| 270 |
# Transformer encoder
|
| 271 |
for layer in self.transformer_encoder:
|
| 272 |
-
x, attn = layer(x,
|
| 273 |
if output_hidden_states:
|
| 274 |
hidden_states.append(x)
|
| 275 |
if output_attentions:
|
|
|
|
| 134 |
|
| 135 |
self.ffn_dropout = nn.Dropout(config.dropout_prob)
|
| 136 |
|
| 137 |
+
def forward(self, x: torch.Tensor, attention_mask: torch.Tensor, freqs_cis: torch.Tensor, output_attentions: bool):
    """Run one pre-norm transformer encoder block over `x`.

    The block applies a residual attention sub-block followed by a residual
    feed-forward sub-block, normalizing the input of each sub-block first
    (pre-norm: `attention_norm` before `_att_block`, `ffn_norm` before
    `_ff_block`).

    Args:
        x: Input hidden states (assumed (batch, seq, dim) — TODO confirm against caller).
        attention_mask: Additive attention mask/bias, forwarded unchanged to `_att_block`.
        freqs_cis: Precomputed rotary (RoPE) frequencies, forwarded to `_att_block`.
        output_attentions: Forwarded to `_att_block`; presumably enables returning
            attention weights as the second value — confirm in `_att_block`.

    Returns:
        Tuple of (updated hidden states, second value from `_att_block`).
    """
    # Attention sub-block with residual connection (input normalized first).
    attn_out, extra = self._att_block(self.attention_norm(x), attention_mask, freqs_cis, output_attentions)
    hidden = x + attn_out
    # Feed-forward sub-block with residual connection (input normalized first).
    hidden = hidden + self._ff_block(self.ffn_norm(hidden))
    return hidden, extra
|
| 142 |
|
| 143 |
+
def _att_block(self, x: torch.Tensor, attention_mask: torch.Tensor, freqs_cis: torch.Tensor, output_attentions: bool):
|
| 144 |
batch_size, seq_len, _ = x.shape
|
| 145 |
xq, xk, xv = self.q(x), self.k(x), self.v(x)
|
| 146 |
|
|
|
|
| 154 |
attn_weights = None
|
| 155 |
if output_attentions:
|
| 156 |
attn_weights = xq.permute(0, 2, 1, 3) @ xk.permute(0, 2, 3, 1) / (xq.size(-1) ** 0.5)
|
| 157 |
+
if attention_mask is not None:
|
| 158 |
+
attn_weights = attn_weights + attention_mask
|
| 159 |
attn_weights = attn_weights.softmax(-1)
|
| 160 |
|
| 161 |
# Compute the attention using xformers if the tensors are on GPU
|
|
|
|
| 166 |
query=xq,
|
| 167 |
key=xk,
|
| 168 |
value=xv,
|
| 169 |
+
attn_bias=attention_mask,
|
| 170 |
p=self.config.dropout_prob if self.training else 0,
|
| 171 |
)
|
| 172 |
else:
|
|
|
|
| 175 |
query=xq.transpose(1, 2),
|
| 176 |
key=xk.transpose(1, 2),
|
| 177 |
value=xv.transpose(1, 2),
|
| 178 |
+
attn_mask=attention_mask,
|
| 179 |
dropout_p=self.config.dropout_prob if self.training else 0,
|
| 180 |
).transpose(1, 2)
|
| 181 |
|
|
|
|
| 249 |
return model, tokenizer
|
| 250 |
|
| 251 |
|
| 252 |
+
def forward(self, input_ids, attention_mask=None, output_hidden_states=False, output_attentions=False):
|
| 253 |
# Initialize
|
| 254 |
hidden_states, attentions = [], []
|
| 255 |
|
| 256 |
# Expand and repeat: (Batch, Length) -> (Batch, Heads, Length, Length)
|
| 257 |
+
if attention_mask is not None:
|
| 258 |
+
assert attention_mask.dtype != torch.bool and 1.0 not in attention_mask, "AMPLIFY expects an additive attention_mask"
|
| 259 |
+
attention_mask = attention_mask.unsqueeze(1).unsqueeze(1).repeat(1, self.config.num_attention_heads, attention_mask.size(-1), 1)
|
| 260 |
|
| 261 |
# RoPE
|
| 262 |
+
self.freqs_cis = self.freqs_cis.to(input_ids.device, non_blocking=True)
|
| 263 |
+
freqs_cis = self.freqs_cis[: input_ids.shape[1]]
|
| 264 |
|
| 265 |
# Embedding
|
| 266 |
+
x = self.encoder(input_ids)
|
| 267 |
if self.config.layer_norm_after_embedding:
|
| 268 |
x = self.layer_norm_1(x)
|
| 269 |
|
| 270 |
# Transformer encoder
|
| 271 |
for layer in self.transformer_encoder:
|
| 272 |
+
x, attn = layer(x, attention_mask, freqs_cis, output_attentions)
|
| 273 |
if output_hidden_states:
|
| 274 |
hidden_states.append(x)
|
| 275 |
if output_attentions:
|