Update modeling_klear.py

modeling_klear.py (+2, -77)

This commit removes the sequence-level auxiliary load-balancing loss from the Klear MoE model: the standalone load_balancing_loss_func helper is deleted, and the output_router_logits branch in KlearMoeForCausalLM.forward is stubbed out, so aux_loss is no longer computed or added to the training loss.
@@ -552,73 +552,6 @@ class KlearModel(KlearPreTrainedModel):
         )


-def load_balancing_loss_func(
-    gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
-    num_experts: Optional[int] = None,
-    top_k: int = 2,
-    attention_mask: Optional[torch.Tensor] = None,
-    moe_aux_loss_coeff: float = 1,
-) -> torch.Tensor:
-    """
-    Computes sequence-level auxiliary load balancing loss for MoE gating.
-
-    Args:
-        gate_logits: Tensor of shape [batch_size, seq_len, num_experts]
-            or a tuple of such tensors (for multiple towers).
-        num_experts: Number of experts (inferred from gate_logits if None).
-        top_k: Number of top experts chosen per token.
-        attention_mask: Optional mask [batch_size, seq_len], 1 for valid tokens, 0 for padding.
-        moe_aux_loss_coeff: Scaling coefficient for the balancing loss.
-
-    Returns:
-        A scalar tensor representing the load balancing loss.
-    """
-    # Merge towers if provided
-    if isinstance(gate_logits, tuple):
-        gate_logits = torch.cat(gate_logits, dim=0)
-
-    assert gate_logits is not None, "gate_logits must be provided"
-    batch_size, seq_len, n_experts = gate_logits.shape
-    num_experts = n_experts if num_experts is None else num_experts
-    assert num_experts == n_experts, f"num_experts ({num_experts}) != gate dimension ({n_experts})"
-
-    # Compute gating probabilities
-    gate_probs = F.softmax(gate_logits, dim=-1)
-
-    # Optionally mask padding tokens
-    if attention_mask is not None:
-        mask = attention_mask.float().unsqueeze(-1)  # [batch, seq, 1]
-    else:
-        mask = torch.ones(batch_size, seq_len, 1, device=gate_logits.device)
-
-    # Select top_k experts per token
-    topk_vals, topk_idx = torch.topk(gate_probs, top_k, dim=-1)  # both [batch, seq, top_k]
-    # Build one-hot mask of assignments
-    one_hot = F.one_hot(topk_idx, num_experts).float()  # [batch, seq, top_k, num_experts]
-    # Sum along top_k to combine multiple choices
-    expert_mask = one_hot.sum(dim=2)  # [batch, seq, num_experts]
-
-    # Apply token mask
-    expert_mask = expert_mask * mask  # zeros out padding
-    gate_probs_masked = gate_probs * mask
-
-    # Normalizer: number of valid tokens per sample
-    tokens_per_sample = mask.sum(dim=1).clamp(min=1.0)  # [batch, 1]
-
-    # Sequence-level tokens per expert: fraction of tokens routed to each expert per sample
-    tokens_per_expert = expert_mask.sum(dim=1).div_(tokens_per_sample * top_k / num_experts)  # [batch, num_experts]
-
-    # Sequence-level average probability per expert per sample
-    router_prob_per_expert = gate_probs_masked.sum(dim=1).div(tokens_per_sample)  # [batch, num_experts]
-
-    # Compute loss per sample: encourage uniform load
-    # Loss = sum_e (tokens_e * probs_e)
-    loss_per_sample = (tokens_per_expert * router_prob_per_expert).sum(dim=1)  # [batch]
-    # Average across batch and scale
-    loss = moe_aux_loss_coeff * loss_per_sample.mean()
-    return loss
-
-
 @auto_docstring
 class KlearMoeForCausalLM(KlearPreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["lm_head.weight"]
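For readers of the diff: the deleted helper implements a per-sequence variant of the router auxiliary loss. For each sample it multiplies the fraction of tokens dispatched to each expert (scaled so a uniform router yields 1) by the mean router probability for that expert, sums over experts, then averages over the batch and scales by moe_aux_loss_coeff. Below is a minimal, self-contained sketch of the same computation on toy tensors; the shapes and random inputs are illustrative assumptions, not values from the model.

import torch
import torch.nn.functional as F

batch_size, seq_len, num_experts, top_k = 2, 6, 8, 2
gate_logits = torch.randn(batch_size, seq_len, num_experts)
attention_mask = torch.ones(batch_size, seq_len)  # 1 = valid token, 0 = padding

gate_probs = F.softmax(gate_logits, dim=-1)                    # [B, S, E]
_, topk_idx = torch.topk(gate_probs, top_k, dim=-1)            # [B, S, K]
expert_mask = F.one_hot(topk_idx, num_experts).float().sum(2)  # [B, S, E], 1 where expert chosen

mask = attention_mask.float().unsqueeze(-1)                    # [B, S, 1]
tokens_per_sample = mask.sum(dim=1).clamp(min=1.0)             # [B, 1]

# Fraction of valid tokens routed to each expert, normalized so a perfectly
# uniform router yields 1.0 for every expert.
tokens_per_expert = (expert_mask * mask).sum(dim=1) / (tokens_per_sample * top_k / num_experts)
# Mean router probability assigned to each expert over valid tokens.
router_prob_per_expert = (gate_probs * mask).sum(dim=1) / tokens_per_sample

moe_aux_loss_coeff = 1.0
aux_loss = moe_aux_loss_coeff * (tokens_per_expert * router_prob_per_expert).sum(dim=1).mean()
print(aux_loss)  # about 1.0 for a balanced router; grows as routing skews

Note this differs from the Switch-Transformer-style loss only in where the averaging happens: the statistics are computed per sample and then averaged over the batch, rather than pooled across the whole batch.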
@@ -720,16 +653,8 @@ class KlearMoeForCausalLM(KlearPreTrainedModel, GenerationMixin):
 
         aux_loss = None
         if output_router_logits:
-            aux_loss = load_balancing_loss_func(
-                outputs.router_logits,
-                self.num_experts,
-                self.num_experts_per_tok,
-                attention_mask,
-                self.moe_aux_loss_coeff,
-            )
-            if labels is not None:
-                loss += self.router_aux_loss_coef * aux_loss.to(loss.device)  # make sure to reside in the same device
-
+            pass
+
         return MoeCausalLMOutputWithPast(
             loss=loss,
             aux_loss=aux_loss,
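Net effect: with the branch body reduced to pass, aux_loss stays None and no auxiliary term is added to loss, even when router logits are requested. A hedged usage sketch of observing this from the outside; the checkpoint path is a placeholder, not part of this commit:

from transformers import AutoModelForCausalLM, AutoTokenizer

path = "path/to/klear-checkpoint"  # placeholder path, assumed for illustration
tok = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(path, trust_remote_code=True)

inputs = tok("hello world", return_tensors="pt")
out = model(**inputs, labels=inputs["input_ids"], output_router_logits=True)
print(out.aux_loss)  # None after this change
print(out.loss)      # pure language-modeling loss, no auxiliary term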