Chengyue Wu committed
Commit · 7eec723 · 1 Parent(s): 19930b4
update training

Files changed: modeling.py (+129 -23)
modeling.py CHANGED
@@ -36,6 +36,55 @@ class BaseModelOutputWithPastAndBlockCache(BaseModelOutputWithPast):
     block_past_key_values: Optional[Cache] = None
 
 
+@torch.compile(fullgraph=True, mode="max-autotune-no-cudagraphs")
+def fused_flex_attention(q, k, v, mask=None):
+    return flex_attention(q, k, v, block_mask=mask, enable_gqa=True)
+
+def block_diff_mask(b, h, q_idx, kv_idx, block_size=None, n=None):
+    """
+    Constructs the specialized block diffusion attention mask for training
+    composed of three masks:
+    - **Block Diagonal Mask (M_BD)**: Self-attention within noised blocks
+    - **Offset Block Causal Mask (M_OBC)**: Cross-attention for conditional context
+    - **Block Causal Mask (M_BC)**: Attention to update x0
+
+    Args:
+        b, h: Batch and head indices (ignored for mask logic).
+        q_idx, kv_idx: Query and Key indices.
+        seq_len: Total sequence length.
+        block_size: Defines the block structure.
+
+    Returns:
+        A boolean attention mask.
+    """
+    # Indicate whether token belongs to xt or x0
+    x0_flag_q = (q_idx >= n)
+    x0_flag_kv = (kv_idx >= n)
+
+    # Compute block indices
+    block_q = torch.where(x0_flag_q == 1,
+                          (q_idx - n) // block_size,
+                          q_idx // block_size)
+    block_kv = torch.where(x0_flag_kv == 1,
+                           (kv_idx - n) // block_size,
+                           kv_idx // block_size)
+
+    # **1. Block Diagonal Mask (M_BD) **
+    block_diagonal = (block_q == block_kv) & (x0_flag_q == x0_flag_kv)
+
+    # **2. Offset Block-Causal Mask (M_OBC) **
+    offset_block_causal = (
+        (block_q > block_kv)
+        & (x0_flag_kv == 1)
+        & (x0_flag_q == 0)
+    )
+
+    # **3. Block-Causal Mask (M_BC) **
+    block_causal = (block_q >= block_kv) & (x0_flag_kv == 1) & (x0_flag_q == 1)
+
+    # **4. Combine Masks **
+    return block_diagonal | offset_block_causal | block_causal
+
 def eval_block_diff_mask(q_idx, kv_idx, block_size=None):
     # Compute block indices
     block_q = q_idx // block_size
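For reference, a minimal standalone sketch (not part of the commit) of how the combined mask can be inspected: it evaluates block_diff_mask densely on index grids for a toy [x_t, x_0] layout. The toy sizes and the `from modeling import ...` path are assumptions for illustration only.

# Sketch only: inspect the combined mask on a toy doubled sequence.
import torch
from modeling import block_diff_mask  # hypothetical import path for the function above

n, block_size = 8, 4                      # 8 tokens per half, 2 blocks of 4 (toy values)
q_idx = torch.arange(2 * n)[:, None]      # rows: queries over [x_t, x_0]
kv_idx = torch.arange(2 * n)[None, :]     # cols: keys/values over [x_t, x_0]

mask = block_diff_mask(None, None, q_idx, kv_idx, block_size=block_size, n=n)
print(mask.int())
# Upper-left quadrant: block-diagonal (M_BD) within x_t.
# Upper-right quadrant: strictly earlier x_0 blocks visible to x_t (M_OBC).
# Lower-right quadrant: block-causal (M_BC) within x_0.
# Lower-left quadrant: all zeros -- x_0 never attends to x_t.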
@@ -180,20 +229,24 @@ class Fast_dLLM_QwenAttention(nn.Module):
             key_states = torch.cat((past_key_value[self.layer_idx][0], key_states), dim=-2)
             value_states = torch.cat((past_key_value[self.layer_idx][1], value_states), dim=-2)
 
-        [14 removed lines; their content was not captured in this view]
+        if self.training:
+            attn_output = fused_flex_attention(query_states, key_states, value_states, mask=attention_mask)
+            attn_output = attn_output.transpose(1, 2).contiguous()
+        else:
+            attention_interface = ALL_ATTENTION_FUNCTIONS["sdpa"]
+
+            attn_output, attn_weights = attention_interface(
+                self,
+                query_states,
+                key_states,
+                value_states,
+                attention_mask,
+                is_causal=False,
+                dropout=0.0 if not self.training else self.attention_dropout,
+                scaling=self.scaling,
+                sliding_window=self.sliding_window,  # main diff with Llama
+                **kwargs,
+            )
 
         attn_output = attn_output.reshape(*input_shape, -1).contiguous()
         attn_output = self.o_proj(attn_output)
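The training branch relies on flex_attention's enable_gqa=True to let the grouped query heads share the smaller set of KV heads, and its (B, H, L, D) output layout is why the result is transposed back before the reshape. A rough shape check follows; the head counts and dims are placeholders, not the model's config, and PyTorch 2.5+ is assumed.

# Shape sketch only; sizes are placeholders, not the model configuration.
import torch
from torch.nn.attention.flex_attention import flex_attention

B, L, D = 2, 16, 64
n_q_heads, n_kv_heads = 8, 2              # GQA: 4 query heads per KV head

q = torch.randn(B, n_q_heads, L, D)
k = torch.randn(B, n_kv_heads, L, D)
v = torch.randn(B, n_kv_heads, L, D)

# block_mask=None means dense attention; the commit passes the block-diffusion mask.
out = flex_attention(q, k, v, block_mask=None, enable_gqa=True)
print(out.shape)                          # (B, H_q, L, D) = (2, 8, 16, 64)
print(out.transpose(1, 2).shape)          # (B, L, H_q, D), ready for the reshape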
@@ -376,6 +429,13 @@ class Fast_dLLM_QwenModel(Fast_dLLM_QwenPreTrainedModel):
         )
         return mask
 
+    def gen_mask(self, seqlen, block_size, B, H):
+        mask = create_block_mask(
+            partial(block_diff_mask, block_size=block_size, n=seqlen),
+            B=B, H=H, Q_LEN=seqlen*2, KV_LEN=seqlen*2)
+
+        return mask
+
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,
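gen_mask materializes the same predicate as a flex-attention BlockMask over the doubled sequence [x_t, x_0], which is why Q_LEN and KV_LEN are both 2 * seqlen. A standalone sketch of an equivalent call; the sizes, the CPU device, and the import path are assumptions, not taken from the commit.

# Sketch of what gen_mask builds; toy sizes, CPU device, hypothetical import.
from functools import partial

from torch.nn.attention.flex_attention import create_block_mask
from modeling import block_diff_mask  # hypothetical import path

seqlen, block_size = 128, 32
block_mask = create_block_mask(
    partial(block_diff_mask, block_size=block_size, n=seqlen),
    B=None, H=None,                       # broadcast over batch and heads
    Q_LEN=2 * seqlen, KV_LEN=2 * seqlen,
    device="cpu",
)
print(block_mask)                         # summary of which attention tiles are kept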
@@ -407,23 +467,31 @@ class Fast_dLLM_QwenModel(Fast_dLLM_QwenPreTrainedModel):
 
         if cache_position is None:
             past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
-            if
-            block_start_position = past_seen_tokens+replace_position if replace_position is not None else past_seen_tokens
+            if self.training:
                 cache_position = torch.arange(
-
+                    past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1]//2, device=inputs_embeds.device
                 )
             else:
-
-
-
+                if use_block_cache:
+                    block_start_position = past_seen_tokens+replace_position if replace_position is not None else past_seen_tokens
+                    cache_position = torch.arange(
+                        block_start_position, block_start_position + inputs_embeds.shape[1], device=inputs_embeds.device
+                    )
+                else:
+                    cache_position = torch.arange(
+                        past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1] if not self.training else inputs_embeds.shape[1]//2, device=inputs_embeds.device
+                    )
 
         if position_ids is None:
             position_ids = cache_position.unsqueeze(0)
 
-        if
-        attention_mask =
+        if self.training:
+            attention_mask = self.gen_mask(labels.shape[1], self.bd_size, labels.shape[0], self.config.num_attention_heads).to(device=inputs_embeds.device)
         else:
-
+            if use_block_cache and block_past_key_values.get_seq_length() != 0:
+                attention_mask = None
+            else:
+                attention_mask = self.eval_mask(input_ids.shape[1], block_size, past_key_values.get_seq_length() if past_key_values is not None else 0).to(device=inputs_embeds.device)
 
         hidden_states = inputs_embeds
 
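The three cache_position branches above reduce to simple arange windows. A toy illustration with made-up numbers (not from the commit) of what each branch produces:

# Made-up numbers; illustrates the three cache_position branches above.
import torch

# Training: the input is [x_t, x_0] concatenated, so positions span only half
# the input length (the x_t and x_0 halves presumably share positions; no cache used).
inputs_len = 16                                        # 2 * L, toy value
print(torch.arange(0, inputs_len // 2))                # tensor([0, 1, ..., 7])

# Block cache: re-decode a block starting `replace_position` tokens into the cache.
past_seen_tokens, replace_position, cur_len = 12, 4, 4
start = past_seen_tokens + replace_position
print(torch.arange(start, start + cur_len))            # tensor([16, 17, 18, 19])

# Default: continue right after the cached prefix.
print(torch.arange(past_seen_tokens, past_seen_tokens + cur_len))  # tensor([12, 13, 14, 15])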
@@ -503,9 +571,45 @@ class Fast_dLLM_QwenForCausalLM(Fast_dLLM_QwenPreTrainedModel, GenerationMixin):
         use_block_cache: Optional[bool] = False,
         block_past_key_values: Optional[Cache] = None,
         replace_position: Optional[int] = None,
+        mask_id: Optional[int] = 151665,
         **kwargs
     ) -> CausalLMOutputWithPastAndBlockCache:
 
+        if self.training:
+            original_labels = labels.clone()
+            original_input_ids = input_ids.clone()
+
+            noisy_input_ids = input_ids.clone()
+
+            input_ids = input_ids.reshape(input_ids.shape[0] * input_ids.shape[1] // self.model.bd_size, self.model.bd_size)
+            b, l = input_ids.shape
+            t = torch.rand((b,), device=input_ids.device)
+            eps=1e-3
+            p_mask = (1 - eps) * t + eps
+            p_mask = p_mask[:, None].repeat(1, l)
+
+            mask_indices = torch.rand((b, l), device=input_ids.device) < p_mask
+            x_t = torch.where(mask_indices, mask_id, input_ids).reshape(labels.shape)
+            noisy_input_ids[labels != -100] = x_t[labels != -100]
+            mask = (noisy_input_ids != mask_id)
+            labels[mask] = -100
+            input_ids = torch.cat([noisy_input_ids, input_ids.reshape(labels.shape)], dim=1)
+
+            complementary_noisy_input_ids = original_input_ids.clone()
+            complementary_labels = original_labels.clone()
+
+            complementary_input_ids = original_input_ids.reshape(original_input_ids.shape[0] * original_input_ids.shape[1] // self.model.bd_size, self.model.bd_size)
+
+            complementary_mask_indices = ~mask_indices
+            complementary_x_t = torch.where(complementary_mask_indices, mask_id, complementary_input_ids).reshape(labels.shape)
+            complementary_noisy_input_ids[complementary_labels != -100] = complementary_x_t[complementary_labels != -100]
+            complementary_mask = (complementary_noisy_input_ids != mask_id)
+            complementary_labels[complementary_mask] = -100
+            complementary_input_ids = torch.cat([complementary_noisy_input_ids, complementary_input_ids.reshape(complementary_labels.shape)], dim=1)
+
+            input_ids = torch.cat([input_ids, complementary_input_ids], dim=0)
+            labels = torch.cat([labels, complementary_labels], dim=0)
+
         outputs: BaseModelOutputWithPastAndBlockCache = self.model(
             input_ids=input_ids,
             labels=labels,
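The training branch applies the discrete-diffusion forward process per block: each block draws a noise level t, tokens are replaced by mask_id with probability p = (1 - eps) * t + eps, and a complementary copy masks exactly the opposite positions, so across the doubled batch every supervised token is masked exactly once. A self-contained sketch of just the masking step; the sizes and vocabulary are placeholders, and the labels != -100 handling that keeps prompt tokens un-noised is omitted here.

# Minimal sketch of the per-block masking used in the training branch
# (sizes and token values are illustrative placeholders).
import torch

mask_id, bd_size, eps = 151665, 4, 1e-3
input_ids = torch.randint(0, 1000, (2, 16))               # (batch, seq_len); toy tokens

blocks = input_ids.reshape(-1, bd_size)                   # one row per block
b, l = blocks.shape
t = torch.rand((b,), device=blocks.device)                # one noise level per block
p_mask = ((1 - eps) * t + eps)[:, None].repeat(1, l)      # masking probability in [eps, 1]

mask_indices = torch.rand((b, l), device=blocks.device) < p_mask
x_t = torch.where(mask_indices, mask_id, blocks).reshape(input_ids.shape)

# Complementary copy: mask exactly the positions left visible above.
x_t_comp = torch.where(~mask_indices, mask_id, blocks).reshape(input_ids.shape)

# Across the two copies, every position is masked exactly once.
assert ((x_t == mask_id) ^ (x_t_comp == mask_id)).all()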
@@ -524,6 +628,8 @@ class Fast_dLLM_QwenForCausalLM(Fast_dLLM_QwenPreTrainedModel, GenerationMixin):
         )
 
         hidden_states = outputs.last_hidden_state
+        if self.training:
+            hidden_states = hidden_states[:, :hidden_states.shape[1]//2, :]
         # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
         slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
         logits = self.lm_head(hidden_states[:, slice_indices, :])