Commit 44f7caf · msr2000 committed
1 Parent(s): 3d445e7

Update model names

Files changed:
- config.json (+5 -5)
- configuration_deepseek.py (+8 -8)
- modeling_deepseek.py (+69 -93)
config.json
CHANGED
@@ -1,13 +1,13 @@
 {
   "architectures": [
-    "DeepseekForCausalLM"
+    "DeepseekV2ForCausalLM"
   ],
   "attention_bias": false,
   "attention_dropout": 0.0,
   "auto_map": {
-    "AutoConfig": "configuration_deepseek.DeepseekConfig",
-    "AutoModel": "modeling_deepseek.DeepseekModel",
-    "AutoModelForCausalLM": "modeling_deepseek.DeepseekForCausalLM"
+    "AutoConfig": "configuration_deepseek.DeepseekV2Config",
+    "AutoModel": "modeling_deepseek.DeepseekV2Model",
+    "AutoModelForCausalLM": "modeling_deepseek.DeepseekV2ForCausalLM"
   },
   "aux_loss_alpha": 0.001,
   "bos_token_id": 100000,
@@ -19,7 +19,7 @@
   "intermediate_size": 12288,
   "kv_lora_rank": 512,
   "max_position_embeddings": 163840,
-  "model_type": "deepseek",
+  "model_type": "deepseek_v2",
   "moe_intermediate_size": 1536,
   "moe_layer_freq": 1,
   "n_group": 8,
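The `auto_map` block is what the `transformers` Auto classes consult when the checkpoint is loaded with `trust_remote_code=True`, so its entries must point at the renamed classes. A minimal sketch of how the updated mapping is exercised (the repository id below is a placeholder, not something stated in this commit):

```python
from transformers import AutoConfig, AutoModelForCausalLM

repo_id = "deepseek-ai/DeepSeek-V2"  # placeholder id; any checkpoint shipping this config.json behaves the same way

# "AutoConfig" resolves to configuration_deepseek.DeepseekV2Config
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
assert config.model_type == "deepseek_v2"

# "AutoModelForCausalLM" resolves to modeling_deepseek.DeepseekV2ForCausalLM
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)
```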
configuration_deepseek.py
CHANGED
@@ -4,11 +4,11 @@ from transformers.utils import logging
 logger = logging.get_logger(__name__)
 
 DEEPSEEK_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
-class DeepseekConfig(PretrainedConfig):
+class DeepseekV2Config(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a [`DeepseekModel`]. It is used to instantiate an DeepSeek
+    This is the configuration class to store the configuration of a [`DeepseekV2Model`]. It is used to instantiate an DeepSeek
     model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
-    defaults will yield a similar configuration to that of the DeepSeek-
+    defaults will yield a similar configuration to that of the DeepSeek-V2.
 
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
@@ -17,7 +17,7 @@ class DeepseekConfig(PretrainedConfig):
     Args:
         vocab_size (`int`, *optional*, defaults to 102400):
             Vocabulary size of the Deep model. Defines the number of different tokens that can be represented by the
-            `inputs_ids` passed when calling [`DeepseekModel`]
+            `inputs_ids` passed when calling [`DeepseekV2Model`]
         hidden_size (`int`, *optional*, defaults to 4096):
             Dimension of the hidden representations.
         intermediate_size (`int`, *optional*, defaults to 11008):
@@ -100,16 +100,16 @@ class DeepseekConfig(PretrainedConfig):
            The dropout ratio for the attention probabilities.
 
    ```python
-    >>> from transformers import DeepseekModel, DeepseekConfig
+    >>> from transformers import DeepseekV2Model, DeepseekV2Config
 
-    >>> # Initializing a Deepseek style configuration
-    >>> configuration = DeepseekConfig()
+    >>> # Initializing a Deepseek-V2 style configuration
+    >>> configuration = DeepseekV2Config()
 
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""
 
-    model_type = "deepseek"
+    model_type = "deepseek_v2"
     keys_to_ignore_at_inference = ["past_key_values"]
 
     def __init__(
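Beyond the docstrings, the new `model_type = "deepseek_v2"` is the key under which the configuration can be registered with the Auto API when the remote-code files are used locally. A minimal sketch, assuming `configuration_deepseek.py` and `modeling_deepseek.py` from this repository sit next to the script (with `trust_remote_code=True` the `auto_map` in config.json performs this resolution automatically):

```python
from transformers import AutoConfig, AutoModelForCausalLM

# Local copies of the two files touched by this commit (assumed to be on the import path).
from configuration_deepseek import DeepseekV2Config
from modeling_deepseek import DeepseekV2ForCausalLM

# The registered key must match "model_type" in config.json.
AutoConfig.register("deepseek_v2", DeepseekV2Config)
AutoModelForCausalLM.register(DeepseekV2Config, DeepseekV2ForCausalLM)

# Defaults mirror the docstring example above.
configuration = DeepseekV2Config()
print(configuration.model_type)  # deepseek_v2
```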
modeling_deepseek.py
CHANGED
@@ -55,7 +55,7 @@ from transformers.utils import (
     replace_return_docstrings,
 )
 from transformers.utils.import_utils import is_torch_fx_available
-from .configuration_deepseek import DeepseekConfig
+from .configuration_deepseek import DeepseekV2Config
 import torch.distributed as dist
 import numpy as np
 
@@ -75,7 +75,7 @@ if is_torch_fx_available():
 
 logger = logging.get_logger(__name__)
 
-_CONFIG_FOR_DOC = "DeepseekConfig"
+_CONFIG_FOR_DOC = "DeepseekV2Config"
 
 
 def _get_unpad_data(attention_mask):
@@ -92,34 +92,10 @@ def _get_unpad_data(attention_mask):
     )
 
 
-
-    warnings.warn(
-        "Calling `transformers.models.Deepseek.modeling_Deepseek._prepare_4d_attention_mask` is deprecated and will be removed in v4.37. Use `transformers.modeling_attn_mask_utils._prepare_4d_attention_mask"
-    )
-    return _prepare_4d_attention_mask(mask=mask, dtype=dtype, tgt_len=tgt_len)
-
-
-def _make_causal_mask(
-    input_ids_shape: torch.Size,
-    dtype: torch.dtype,
-    device: torch.device,
-    past_key_values_length: int = 0,
-):
-    warnings.warn(
-        "Calling `transformers.models.Deepseek.modeling_Deepseek._make_causal_mask` is deprecated and will be removed in v4.37. Use `transformers.models.Deepseek.modeling_Deepseek.AttentionMaskConverter._make_causal_mask"
-    )
-    return AttentionMaskConverter._make_causal_mask(
-        input_ids_shape=input_ids_shape,
-        dtype=dtype,
-        device=device,
-        past_key_values_length=past_key_values_length,
-    )
-
-
-class DeepseekRMSNorm(nn.Module):
+class DeepseekV2RMSNorm(nn.Module):
     def __init__(self, hidden_size, eps=1e-6):
         """
-        DeepseekRMSNorm is equivalent to T5LayerNorm
+        DeepseekV2RMSNorm is equivalent to T5LayerNorm
         """
         super().__init__()
         self.weight = nn.Parameter(torch.ones(hidden_size))
@@ -133,10 +109,10 @@ class DeepseekRMSNorm(nn.Module):
         return self.weight * hidden_states.to(input_dtype)
 
 
-ALL_LAYERNORM_LAYERS.append(DeepseekRMSNorm)
+ALL_LAYERNORM_LAYERS.append(DeepseekV2RMSNorm)
 
 
-class DeepseekRotaryEmbedding(nn.Module):
+class DeepseekV2RotaryEmbedding(nn.Module):
     def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
         super().__init__()
 
@@ -179,9 +155,9 @@ class DeepseekRotaryEmbedding(nn.Module):
     )
 
 
-# Copied from transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->Deepseek
-class DeepseekLinearScalingRotaryEmbedding(DeepseekRotaryEmbedding):
-    """DeepseekRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
+# Copied from transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->DeepseekV2
+class DeepseekV2LinearScalingRotaryEmbedding(DeepseekV2RotaryEmbedding):
+    """DeepseekV2RotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
 
     def __init__(
         self,
@@ -208,9 +184,9 @@ class DeepseekLinearScalingRotaryEmbedding(DeepseekRotaryEmbedding):
         self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
 
 
-# Copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->Deepseek
-class DeepseekDynamicNTKScalingRotaryEmbedding(DeepseekRotaryEmbedding):
-    """DeepseekRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
+# Copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->DeepseekV2
+class DeepseekV2DynamicNTKScalingRotaryEmbedding(DeepseekV2RotaryEmbedding):
+    """DeepseekV2RotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
 
     def __init__(
         self,
@@ -284,7 +260,7 @@ def yarn_linear_ramp_mask(min, max, dim):
     return ramp_func
 
 
-class DeepseekYarnRotaryEmbedding(DeepseekRotaryEmbedding):
+class DeepseekV2YarnRotaryEmbedding(DeepseekV2RotaryEmbedding):
 
     def __init__(
         self,
@@ -396,7 +372,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
     return q_embed, k_embed
 
 
-class DeepseekMLP(nn.Module):
+class DeepseekV2MLP(nn.Module):
     def __init__(self, config, hidden_size=None, intermediate_size=None):
         super().__init__()
         self.config = config
@@ -543,7 +519,7 @@ class AddAuxiliaryLoss(torch.autograd.Function):
         return grad_output, grad_loss
 
 
-class DeepseekMoE(nn.Module):
+class DeepseekV2MoE(nn.Module):
     """
     A mixed expert module containing shared experts.
     """
@@ -561,7 +537,7 @@ class DeepseekMoE(nn.Module):
             self.experts = nn.ModuleList(
                 [
                     (
-                        DeepseekMLP(
+                        DeepseekV2MLP(
                             config, intermediate_size=config.moe_intermediate_size
                         )
                         if i >= self.ep_rank * self.experts_per_rank
@@ -577,14 +553,14 @@ class DeepseekMoE(nn.Module):
             self.ep_rank = 0
             self.experts = nn.ModuleList(
                 [
-                    DeepseekMLP(config, intermediate_size=config.moe_intermediate_size)
+                    DeepseekV2MLP(config, intermediate_size=config.moe_intermediate_size)
                     for i in range(config.n_routed_experts)
                 ]
             )
         self.gate = MoEGate(config)
         if config.n_shared_experts is not None:
             intermediate_size = config.moe_intermediate_size * config.n_shared_experts
-            self.shared_experts = DeepseekMLP(
+            self.shared_experts = DeepseekV2MLP(
                 config=config, intermediate_size=intermediate_size
             )
 
@@ -702,11 +678,11 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
     return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
 
 
-# Copied from transformers.models.llama.modeling_llama.LlamaAttention with Llama->Deepseek
-class DeepseekAttention(nn.Module):
+# Copied from transformers.models.llama.modeling_llama.LlamaAttention with Llama->DeepseekV2
+class DeepseekV2Attention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 
-    def __init__(self, config: DeepseekConfig, layer_idx: Optional[int] = None):
+    def __init__(self, config: DeepseekV2Config, layer_idx: Optional[int] = None):
         super().__init__()
         self.config = config
         self.layer_idx = layer_idx
@@ -735,7 +711,7 @@ class DeepseekAttention(nn.Module):
             self.q_a_proj = nn.Linear(
                 self.hidden_size, config.q_lora_rank, bias=config.attention_bias
             )
-            self.q_a_layernorm = DeepseekRMSNorm(config.q_lora_rank)
+            self.q_a_layernorm = DeepseekV2RMSNorm(config.q_lora_rank)
             self.q_b_proj = nn.Linear(
                 config.q_lora_rank, self.num_heads * self.q_head_dim, bias=False
             )
@@ -745,7 +721,7 @@ class DeepseekAttention(nn.Module):
             config.kv_lora_rank + config.qk_rope_head_dim,
             bias=config.attention_bias,
         )
-        self.kv_a_layernorm = DeepseekRMSNorm(config.kv_lora_rank)
+        self.kv_a_layernorm = DeepseekV2RMSNorm(config.kv_lora_rank)
         self.kv_b_proj = nn.Linear(
             config.kv_lora_rank,
             self.num_heads
@@ -770,7 +746,7 @@ class DeepseekAttention(nn.Module):
 
     def _init_rope(self):
         if self.config.rope_scaling is None:
-            self.rotary_emb = DeepseekRotaryEmbedding(
+            self.rotary_emb = DeepseekV2RotaryEmbedding(
                 self.qk_rope_head_dim,
                 max_position_embeddings=self.max_position_embeddings,
                 base=self.rope_theta,
@@ -779,14 +755,14 @@ class DeepseekAttention(nn.Module):
             scaling_type = self.config.rope_scaling["type"]
             scaling_factor = self.config.rope_scaling["factor"]
             if scaling_type == "linear":
-                self.rotary_emb = DeepseekLinearScalingRotaryEmbedding(
+                self.rotary_emb = DeepseekV2LinearScalingRotaryEmbedding(
                    self.qk_rope_head_dim,
                    max_position_embeddings=self.max_position_embeddings,
                    scaling_factor=scaling_factor,
                    base=self.rope_theta,
                )
            elif scaling_type == "dynamic":
-                self.rotary_emb = DeepseekDynamicNTKScalingRotaryEmbedding(
+                self.rotary_emb = DeepseekV2DynamicNTKScalingRotaryEmbedding(
                    self.qk_rope_head_dim,
                    max_position_embeddings=self.max_position_embeddings,
                    scaling_factor=scaling_factor,
@@ -804,7 +780,7 @@ class DeepseekAttention(nn.Module):
                 ]
                 if key in self.config.rope_scaling
             }
-            self.rotary_emb = DeepseekYarnRotaryEmbedding(
+            self.rotary_emb = DeepseekV2YarnRotaryEmbedding(
                 self.qk_rope_head_dim,
                 max_position_embeddings=self.max_position_embeddings,
                 scaling_factor=scaling_factor,
@@ -927,10 +903,10 @@ class DeepseekAttention(nn.Module):
         return attn_output, attn_weights, past_key_value
 
 
-# Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2 with Llama->Deepseek
-class DeepseekFlashAttention2(DeepseekAttention):
+# Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2 with Llama->DeepseekV2
+class DeepseekV2FlashAttention2(DeepseekV2Attention):
     """
-    Deepseek flash attention module. This module inherits from `DeepseekAttention` as the weights of the module stays
+    DeepseekV2 flash attention module. This module inherits from `DeepseekV2Attention` as the weights of the module stays
     untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
     flash attention and deal with padding tokens in case the input contains any of them.
     """
@@ -953,7 +929,7 @@ class DeepseekFlashAttention2(DeepseekAttention):
         use_cache: bool = False,
         **kwargs,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
-        # DeepseekFlashAttention2 attention does not support output_attentions
+        # DeepseekV2FlashAttention2 attention does not support output_attentions
         if "padding_mask" in kwargs:
             warnings.warn(
                 "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
@@ -1027,7 +1003,7 @@ class DeepseekFlashAttention2(DeepseekAttention):
         # therefore the input hidden states gets silently casted in float32. Hence, we need
         # cast them back in the correct dtype just to be sure everything works as expected.
         # This might slowdown training & inference so it is recommended to not cast the LayerNorms
-        # in fp32. (DeepseekRMSNorm handles it correctly)
+        # in fp32. (DeepseekV2RMSNorm handles it correctly)
 
         input_dtype = query_states.dtype
         if input_dtype == torch.float32:
@@ -1103,7 +1079,7 @@ class DeepseekFlashAttention2(DeepseekAttention):
         if not self._flash_attn_uses_top_left_mask:
             causal = self.is_causal
         else:
-            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in DeepseekFlashAttention2 __init__.
+            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in DeepseekV2FlashAttention2 __init__.
             causal = self.is_causal and query_length != 1
 
         # Contains at least one padding token in the sequence
@@ -1198,13 +1174,13 @@ class DeepseekFlashAttention2(DeepseekAttention):
 
 
 ATTENTION_CLASSES = {
-    "eager": DeepseekAttention,
-    "flash_attention_2": DeepseekFlashAttention2,
+    "eager": DeepseekV2Attention,
+    "flash_attention_2": DeepseekV2FlashAttention2,
 }
 
 
-class DeepseekDecoderLayer(nn.Module):
-    def __init__(self, config: DeepseekConfig, layer_idx: int):
+class DeepseekV2DecoderLayer(nn.Module):
+    def __init__(self, config: DeepseekV2Config, layer_idx: int):
         super().__init__()
         self.hidden_size = config.hidden_size
 
@@ -1213,18 +1189,18 @@ class DeepseekDecoderLayer(nn.Module):
         )
 
         self.mlp = (
-            DeepseekMoE(config)
+            DeepseekV2MoE(config)
             if (
                 config.n_routed_experts is not None
                 and layer_idx >= config.first_k_dense_replace
                 and layer_idx % config.moe_layer_freq == 0
             )
-            else DeepseekMLP(config)
+            else DeepseekV2MLP(config)
         )
-        self.input_layernorm = DeepseekRMSNorm(
+        self.input_layernorm = DeepseekV2RMSNorm(
             config.hidden_size, eps=config.rms_norm_eps
         )
-        self.post_attention_layernorm = DeepseekRMSNorm(
+        self.post_attention_layernorm = DeepseekV2RMSNorm(
             config.hidden_size, eps=config.rms_norm_eps
         )
 
@@ -1291,7 +1267,7 @@ class DeepseekDecoderLayer(nn.Module):
         return outputs
 
 
-Deepseek_START_DOCSTRING = r"""
+DeepseekV2_START_DOCSTRING = r"""
     This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
     library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
     etc.)
@@ -1301,7 +1277,7 @@ Deepseek_START_DOCSTRING = r"""
     and behavior.
 
     Parameters:
-        config ([`DeepseekConfig`]):
+        config ([`DeepseekV2Config`]):
             Model configuration class with all the parameters of the model. Initializing with a config file does not
             load the weights associated with the model, only the configuration. Check out the
             [`~PreTrainedModel.from_pretrained`] method to load the model weights.
@@ -1309,14 +1285,14 @@ Deepseek_START_DOCSTRING = r"""
 
 
 @add_start_docstrings(
-    "The bare Deepseek Model outputting raw hidden-states without any specific head on top.",
-    Deepseek_START_DOCSTRING,
+    "The bare DeepseekV2 Model outputting raw hidden-states without any specific head on top.",
+    DeepseekV2_START_DOCSTRING,
 )
-class DeepseekPreTrainedModel(PreTrainedModel):
-    config_class = DeepseekConfig
+class DeepseekV2PreTrainedModel(PreTrainedModel):
+    config_class = DeepseekV2Config
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["DeepseekDecoderLayer"]
+    _no_split_modules = ["DeepseekV2DecoderLayer"]
     _skip_keys_device_placement = "past_key_values"
     _supports_flash_attn_2 = True
     _supports_sdpa = True
@@ -1334,7 +1310,7 @@ class DeepseekPreTrainedModel(PreTrainedModel):
             module.weight.data[module.padding_idx].zero_()
 
 
-Deepseek_INPUTS_DOCSTRING = r"""
+DeepseekV2_INPUTS_DOCSTRING = r"""
     Args:
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
             Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
@@ -1405,18 +1381,18 @@ Deepseek_INPUTS_DOCSTRING = r"""
 
 
 @add_start_docstrings(
-    "The bare Deepseek Model outputting raw hidden-states without any specific head on top.",
-    Deepseek_START_DOCSTRING,
+    "The bare DeepseekV2 Model outputting raw hidden-states without any specific head on top.",
+    DeepseekV2_START_DOCSTRING,
 )
-class DeepseekModel(DeepseekPreTrainedModel):
+class DeepseekV2Model(DeepseekV2PreTrainedModel):
     """
-    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`DeepseekDecoderLayer`]
+    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`DeepseekV2DecoderLayer`]
 
     Args:
-        config: DeepseekConfig
+        config: DeepseekV2Config
     """
 
-    def __init__(self, config: DeepseekConfig):
+    def __init__(self, config: DeepseekV2Config):
         super().__init__(config)
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
@@ -1426,13 +1402,13 @@ class DeepseekModel(DeepseekPreTrainedModel):
         )
         self.layers = nn.ModuleList(
             [
-                DeepseekDecoderLayer(config, layer_idx)
+                DeepseekV2DecoderLayer(config, layer_idx)
                 for layer_idx in range(config.num_hidden_layers)
             ]
         )
         self._use_sdpa = config._attn_implementation == "sdpa"
         self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
-        self.norm = DeepseekRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.norm = DeepseekV2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
         self.gradient_checkpointing = False
         # Initialize weights and apply final processing
@@ -1444,7 +1420,7 @@ class DeepseekModel(DeepseekPreTrainedModel):
     def set_input_embeddings(self, value):
         self.embed_tokens = value
 
-    @add_start_docstrings_to_model_forward(Deepseek_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(DeepseekV2_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids: torch.LongTensor = None,
@@ -1604,12 +1580,12 @@ class DeepseekModel(DeepseekPreTrainedModel):
         )
 
 
-class DeepseekForCausalLM(DeepseekPreTrainedModel):
+class DeepseekV2ForCausalLM(DeepseekV2PreTrainedModel):
     _tied_weights_keys = ["lm_head.weight"]
 
     def __init__(self, config):
         super().__init__(config)
-        self.model = DeepseekModel(config)
+        self.model = DeepseekV2Model(config)
         self.vocab_size = config.vocab_size
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
 
@@ -1634,7 +1610,7 @@ class DeepseekForCausalLM(DeepseekPreTrainedModel):
     def get_decoder(self):
         return self.model
 
-    @add_start_docstrings_to_model_forward(Deepseek_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(DeepseekV2_INPUTS_DOCSTRING)
     @replace_return_docstrings(
         output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
     )
@@ -1663,9 +1639,9 @@ class DeepseekForCausalLM(DeepseekPreTrainedModel):
         Example:
 
         ```python
-        >>> from transformers import AutoTokenizer, DeepseekForCausalLM
+        >>> from transformers import AutoTokenizer, DeepseekV2ForCausalLM
 
-        >>> model = DeepseekForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
+        >>> model = DeepseekV2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
         >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
 
         >>> prompt = "Hey, are you conscious? Can you talk to me?"
@@ -1811,9 +1787,9 @@ class DeepseekForCausalLM(DeepseekPreTrainedModel):
 
 @add_start_docstrings(
     """
-    The Deepseek Model transformer with a sequence classification head on top (linear layer).
+    The DeepseekV2 Model transformer with a sequence classification head on top (linear layer).
 
-    [`DeepseekForSequenceClassification`] uses the last token in order to do the classification, as other causal models
+    [`DeepseekV2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
     (e.g. GPT-2) do.
 
     Since it does classification on the last token, it requires to know the position of the last token. If a
@@ -1822,13 +1798,13 @@ class DeepseekForCausalLM(DeepseekPreTrainedModel):
     padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
     each row of the batch).
     """,
-    Deepseek_START_DOCSTRING,
+    DeepseekV2_START_DOCSTRING,
 )
-class DeepseekForSequenceClassification(DeepseekPreTrainedModel):
+class DeepseekV2ForSequenceClassification(DeepseekV2PreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.num_labels = config.num_labels
-        self.model = DeepseekModel(config)
+        self.model = DeepseekV2Model(config)
         self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
 
         # Initialize weights and apply final processing
@@ -1840,7 +1816,7 @@ class DeepseekForSequenceClassification(DeepseekPreTrainedModel):
     def set_input_embeddings(self, value):
         self.model.embed_tokens = value
 
-    @add_start_docstrings_to_model_forward(Deepseek_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(DeepseekV2_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids: torch.LongTensor = None,
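The commit only renames classes, docstring constants, and the `model_type` string; parameter names are untouched, so existing weights load as before. The renamed `ATTENTION_CLASSES` mapping ("eager" vs "flash_attention_2") is selected through the standard `attn_implementation` argument rather than called directly. A minimal sketch under the same placeholder-repo assumption as above (flash-attn must be installed for the flash option):

```python
import torch
from transformers import AutoModelForCausalLM

# attn_implementation="flash_attention_2" makes the remote code pick
# DeepseekV2FlashAttention2 from ATTENTION_CLASSES; "eager" picks DeepseekV2Attention.
model = AutoModelForCausalLM.from_pretrained(
    "deepseek-ai/DeepSeek-V2",  # placeholder repo id
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
```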