Commit 44f7caf · msr2000 committed
1 Parent(s): 3d445e7

Update model names

Files changed:
- config.json (+5 -5)
- configuration_deepseek.py (+8 -8)
- modeling_deepseek.py (+69 -93)
config.json
CHANGED
@@ -1,13 +1,13 @@
 {
   "architectures": [
-    "DeepseekForCausalLM"
+    "DeepseekV2ForCausalLM"
   ],
   "attention_bias": false,
   "attention_dropout": 0.0,
   "auto_map": {
-    "AutoConfig": "configuration_deepseek.DeepseekConfig",
-    "AutoModel": "modeling_deepseek.DeepseekModel",
-    "AutoModelForCausalLM": "modeling_deepseek.DeepseekForCausalLM"
+    "AutoConfig": "configuration_deepseek.DeepseekV2Config",
+    "AutoModel": "modeling_deepseek.DeepseekV2Model",
+    "AutoModelForCausalLM": "modeling_deepseek.DeepseekV2ForCausalLM"
   },
   "aux_loss_alpha": 0.001,
   "bos_token_id": 100000,
@@ -19,7 +19,7 @@
   "intermediate_size": 12288,
   "kv_lora_rank": 512,
   "max_position_embeddings": 163840,
-  "model_type": "deepseek",
+  "model_type": "deepseek_v2",
   "moe_intermediate_size": 1536,
   "moe_layer_freq": 1,
   "n_group": 8,
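The `auto_map` block is what the `transformers` Auto classes consult when the checkpoint is loaded with `trust_remote_code=True`, so its entries must point at the renamed classes. A minimal sketch of how the updated mapping is exercised (the repository id below is a placeholder, not something stated in this commit):

```python
from transformers import AutoConfig, AutoModelForCausalLM

repo_id = "deepseek-ai/DeepSeek-V2"  # placeholder id; any checkpoint shipping this config.json behaves the same way

# "AutoConfig" resolves to configuration_deepseek.DeepseekV2Config
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
assert config.model_type == "deepseek_v2"

# "AutoModelForCausalLM" resolves to modeling_deepseek.DeepseekV2ForCausalLM
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)
```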
configuration_deepseek.py
CHANGED
@@ -4,11 +4,11 @@ from transformers.utils import logging
 logger = logging.get_logger(__name__)
 
 DEEPSEEK_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
-class DeepseekConfig(PretrainedConfig):
+class DeepseekV2Config(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a [`DeepseekModel`]. It is used to instantiate an DeepSeek
+    This is the configuration class to store the configuration of a [`DeepseekV2Model`]. It is used to instantiate an DeepSeek
     model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
-    defaults will yield a similar configuration to that of the DeepSeek-
+    defaults will yield a similar configuration to that of the DeepSeek-V2.
 
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
@@ -17,7 +17,7 @@ class DeepseekConfig(PretrainedConfig):
     Args:
         vocab_size (`int`, *optional*, defaults to 102400):
             Vocabulary size of the Deep model. Defines the number of different tokens that can be represented by the
-            `inputs_ids` passed when calling [`DeepseekModel`]
+            `inputs_ids` passed when calling [`DeepseekV2Model`]
         hidden_size (`int`, *optional*, defaults to 4096):
             Dimension of the hidden representations.
         intermediate_size (`int`, *optional*, defaults to 11008):
@@ -100,16 +100,16 @@ class DeepseekConfig(PretrainedConfig):
            The dropout ratio for the attention probabilities.
 
    ```python
-    >>> from transformers import DeepseekModel, DeepseekConfig
+    >>> from transformers import DeepseekV2Model, DeepseekV2Config
 
-    >>> # Initializing a Deepseek style configuration
-    >>> configuration = DeepseekConfig()
+    >>> # Initializing a Deepseek-V2 style configuration
+    >>> configuration = DeepseekV2Config()
 
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""
 
-    model_type = "deepseek"
+    model_type = "deepseek_v2"
     keys_to_ignore_at_inference = ["past_key_values"]
 
     def __init__(
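Beyond the docstrings, the new `model_type = "deepseek_v2"` is the key under which the configuration can be registered with the Auto API when the remote-code files are used locally. A minimal sketch, assuming `configuration_deepseek.py` and `modeling_deepseek.py` from this repository sit next to the script (with `trust_remote_code=True` the `auto_map` in config.json performs this resolution automatically):

```python
from transformers import AutoConfig, AutoModelForCausalLM

# Local copies of the two files touched by this commit (assumed to be on the import path).
from configuration_deepseek import DeepseekV2Config
from modeling_deepseek import DeepseekV2ForCausalLM

# The registered key must match "model_type" in config.json.
AutoConfig.register("deepseek_v2", DeepseekV2Config)
AutoModelForCausalLM.register(DeepseekV2Config, DeepseekV2ForCausalLM)

# Defaults mirror the docstring example above.
configuration = DeepseekV2Config()
print(configuration.model_type)  # deepseek_v2
```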
modeling_deepseek.py
CHANGED
@@ -55,7 +55,7 @@ from transformers.utils import (
     replace_return_docstrings,
 )
 from transformers.utils.import_utils import is_torch_fx_available
-from .configuration_deepseek import DeepseekConfig
+from .configuration_deepseek import DeepseekV2Config
 import torch.distributed as dist
 import numpy as np
 
@@ -75,7 +75,7 @@ if is_torch_fx_available():
 
 logger = logging.get_logger(__name__)
 
-_CONFIG_FOR_DOC = "DeepseekConfig"
+_CONFIG_FOR_DOC = "DeepseekV2Config"
 
 
 def _get_unpad_data(attention_mask):
@@ -92,34 +92,10 @@ def _get_unpad_data(attention_mask):
     )
 
 
-
-    warnings.warn(
-        "Calling `transformers.models.Deepseek.modeling_Deepseek._prepare_4d_attention_mask` is deprecated and will be removed in v4.37. Use `transformers.modeling_attn_mask_utils._prepare_4d_attention_mask"
-    )
-    return _prepare_4d_attention_mask(mask=mask, dtype=dtype, tgt_len=tgt_len)
-
-
-def _make_causal_mask(
-    input_ids_shape: torch.Size,
-    dtype: torch.dtype,
-    device: torch.device,
-    past_key_values_length: int = 0,
-):
-    warnings.warn(
-        "Calling `transformers.models.Deepseek.modeling_Deepseek._make_causal_mask` is deprecated and will be removed in v4.37. Use `transformers.models.Deepseek.modeling_Deepseek.AttentionMaskConverter._make_causal_mask"
-    )
-    return AttentionMaskConverter._make_causal_mask(
-        input_ids_shape=input_ids_shape,
-        dtype=dtype,
-        device=device,
-        past_key_values_length=past_key_values_length,
-    )
-
-
-class DeepseekRMSNorm(nn.Module):
+class DeepseekV2RMSNorm(nn.Module):
     def __init__(self, hidden_size, eps=1e-6):
         """
-        DeepseekRMSNorm is equivalent to T5LayerNorm
+        DeepseekV2RMSNorm is equivalent to T5LayerNorm
         """
         super().__init__()
         self.weight = nn.Parameter(torch.ones(hidden_size))
@@ -133,10 +109,10 @@ class DeepseekRMSNorm(nn.Module):
         return self.weight * hidden_states.to(input_dtype)
 
 
-ALL_LAYERNORM_LAYERS.append(DeepseekRMSNorm)
+ALL_LAYERNORM_LAYERS.append(DeepseekV2RMSNorm)
 
 
-class DeepseekRotaryEmbedding(nn.Module):
+class DeepseekV2RotaryEmbedding(nn.Module):
     def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
         super().__init__()
 
@@ -179,9 +155,9 @@ class DeepseekRotaryEmbedding(nn.Module):
     )
 
 
-# Copied from transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->Deepseek
-class DeepseekLinearScalingRotaryEmbedding(DeepseekRotaryEmbedding):
-    """DeepseekRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
+# Copied from transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->DeepseekV2
+class DeepseekV2LinearScalingRotaryEmbedding(DeepseekV2RotaryEmbedding):
+    """DeepseekV2RotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
 
     def __init__(
         self,
@@ -208,9 +184,9 @@ class DeepseekLinearScalingRotaryEmbedding(DeepseekRotaryEmbedding):
         self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
 
 
-# Copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->Deepseek
-class DeepseekDynamicNTKScalingRotaryEmbedding(DeepseekRotaryEmbedding):
-    """DeepseekRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
+# Copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->DeepseekV2
+class DeepseekV2DynamicNTKScalingRotaryEmbedding(DeepseekV2RotaryEmbedding):
+    """DeepseekV2RotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
 
     def __init__(
         self,
@@ -284,7 +260,7 @@ def yarn_linear_ramp_mask(min, max, dim):
     return ramp_func
 
 
-class DeepseekYarnRotaryEmbedding(DeepseekRotaryEmbedding):
+class DeepseekV2YarnRotaryEmbedding(DeepseekV2RotaryEmbedding):
 
     def __init__(
         self,
@@ -396,7 +372,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
     return q_embed, k_embed
 
 
-class DeepseekMLP(nn.Module):
+class DeepseekV2MLP(nn.Module):
     def __init__(self, config, hidden_size=None, intermediate_size=None):
         super().__init__()
         self.config = config
@@ -543,7 +519,7 @@ class AddAuxiliaryLoss(torch.autograd.Function):
         return grad_output, grad_loss
 
 
-class DeepseekMoE(nn.Module):
+class DeepseekV2MoE(nn.Module):
     """
     A mixed expert module containing shared experts.
     """
@@ -561,7 +537,7 @@ class DeepseekMoE(nn.Module):
             self.experts = nn.ModuleList(
                 [
                     (
-                        DeepseekMLP(
+                        DeepseekV2MLP(
                             config, intermediate_size=config.moe_intermediate_size
                         )
                         if i >= self.ep_rank * self.experts_per_rank
@@ -577,14 +553,14 @@ class DeepseekMoE(nn.Module):
             self.ep_rank = 0
             self.experts = nn.ModuleList(
                 [
-                    DeepseekMLP(config, intermediate_size=config.moe_intermediate_size)
+                    DeepseekV2MLP(config, intermediate_size=config.moe_intermediate_size)
                     for i in range(config.n_routed_experts)
                 ]
             )
         self.gate = MoEGate(config)
         if config.n_shared_experts is not None:
             intermediate_size = config.moe_intermediate_size * config.n_shared_experts
-            self.shared_experts = DeepseekMLP(
+            self.shared_experts = DeepseekV2MLP(
                 config=config, intermediate_size=intermediate_size
             )
 
@@ -702,11 +678,11 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
     return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
 
 
-# Copied from transformers.models.llama.modeling_llama.LlamaAttention with Llama->Deepseek
-class DeepseekAttention(nn.Module):
+# Copied from transformers.models.llama.modeling_llama.LlamaAttention with Llama->DeepseekV2
+class DeepseekV2Attention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 
-    def __init__(self, config: DeepseekConfig, layer_idx: Optional[int] = None):
+    def __init__(self, config: DeepseekV2Config, layer_idx: Optional[int] = None):
         super().__init__()
         self.config = config
         self.layer_idx = layer_idx
@@ -735,7 +711,7 @@ class DeepseekAttention(nn.Module):
             self.q_a_proj = nn.Linear(
                 self.hidden_size, config.q_lora_rank, bias=config.attention_bias
             )
-            self.q_a_layernorm = DeepseekRMSNorm(config.q_lora_rank)
+            self.q_a_layernorm = DeepseekV2RMSNorm(config.q_lora_rank)
             self.q_b_proj = nn.Linear(
                 config.q_lora_rank, self.num_heads * self.q_head_dim, bias=False
             )
@@ -745,7 +721,7 @@ class DeepseekAttention(nn.Module):
             config.kv_lora_rank + config.qk_rope_head_dim,
             bias=config.attention_bias,
         )
-        self.kv_a_layernorm = DeepseekRMSNorm(config.kv_lora_rank)
+        self.kv_a_layernorm = DeepseekV2RMSNorm(config.kv_lora_rank)
         self.kv_b_proj = nn.Linear(
             config.kv_lora_rank,
             self.num_heads
@@ -770,7 +746,7 @@ class DeepseekAttention(nn.Module):
 
     def _init_rope(self):
         if self.config.rope_scaling is None:
-            self.rotary_emb = DeepseekRotaryEmbedding(
+            self.rotary_emb = DeepseekV2RotaryEmbedding(
                 self.qk_rope_head_dim,
                 max_position_embeddings=self.max_position_embeddings,
                 base=self.rope_theta,
@@ -779,14 +755,14 @@ class DeepseekAttention(nn.Module):
             scaling_type = self.config.rope_scaling["type"]
             scaling_factor = self.config.rope_scaling["factor"]
             if scaling_type == "linear":
-                self.rotary_emb = DeepseekLinearScalingRotaryEmbedding(
+                self.rotary_emb = DeepseekV2LinearScalingRotaryEmbedding(
                    self.qk_rope_head_dim,
                    max_position_embeddings=self.max_position_embeddings,
                    scaling_factor=scaling_factor,
                    base=self.rope_theta,
                )
            elif scaling_type == "dynamic":
-                self.rotary_emb = DeepseekDynamicNTKScalingRotaryEmbedding(
+                self.rotary_emb = DeepseekV2DynamicNTKScalingRotaryEmbedding(
                    self.qk_rope_head_dim,
                    max_position_embeddings=self.max_position_embeddings,
                    scaling_factor=scaling_factor,
@@ -804,7 +780,7 @@ class DeepseekAttention(nn.Module):
                 ]
                 if key in self.config.rope_scaling
             }
-            self.rotary_emb = DeepseekYarnRotaryEmbedding(
+            self.rotary_emb = DeepseekV2YarnRotaryEmbedding(
                 self.qk_rope_head_dim,
                 max_position_embeddings=self.max_position_embeddings,
                 scaling_factor=scaling_factor,
@@ -927,10 +903,10 @@ class DeepseekAttention(nn.Module):
         return attn_output, attn_weights, past_key_value
 
 
-# Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2 with Llama->Deepseek
-class DeepseekFlashAttention2(DeepseekAttention):
+# Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2 with Llama->DeepseekV2
+class DeepseekV2FlashAttention2(DeepseekV2Attention):
     """
-    Deepseek flash attention module. This module inherits from `DeepseekAttention` as the weights of the module stays
+    DeepseekV2 flash attention module. This module inherits from `DeepseekV2Attention` as the weights of the module stays
     untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
     flash attention and deal with padding tokens in case the input contains any of them.
     """
@@ -953,7 +929,7 @@ class DeepseekFlashAttention2(DeepseekAttention):
         use_cache: bool = False,
         **kwargs,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
-        # DeepseekFlashAttention2 attention does not support output_attentions
+        # DeepseekV2FlashAttention2 attention does not support output_attentions
         if "padding_mask" in kwargs:
             warnings.warn(
                 "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
@@ -1027,7 +1003,7 @@ class DeepseekFlashAttention2(DeepseekAttention):
         # therefore the input hidden states gets silently casted in float32. Hence, we need
         # cast them back in the correct dtype just to be sure everything works as expected.
         # This might slowdown training & inference so it is recommended to not cast the LayerNorms
-        # in fp32. (DeepseekRMSNorm handles it correctly)
+        # in fp32. (DeepseekV2RMSNorm handles it correctly)
 
         input_dtype = query_states.dtype
         if input_dtype == torch.float32:
@@ -1103,7 +1079,7 @@ class DeepseekFlashAttention2(DeepseekAttention):
         if not self._flash_attn_uses_top_left_mask:
             causal = self.is_causal
         else:
-            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in DeepseekFlashAttention2 __init__.
+            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in DeepseekV2FlashAttention2 __init__.
             causal = self.is_causal and query_length != 1
 
         # Contains at least one padding token in the sequence
@@ -1198,13 +1174,13 @@ class DeepseekFlashAttention2(DeepseekAttention):
 
 
 ATTENTION_CLASSES = {
-    "eager": DeepseekAttention,
-    "flash_attention_2": DeepseekFlashAttention2,
+    "eager": DeepseekV2Attention,
+    "flash_attention_2": DeepseekV2FlashAttention2,
 }
 
 
-class DeepseekDecoderLayer(nn.Module):
-    def __init__(self, config: DeepseekConfig, layer_idx: int):
+class DeepseekV2DecoderLayer(nn.Module):
+    def __init__(self, config: DeepseekV2Config, layer_idx: int):
         super().__init__()
         self.hidden_size = config.hidden_size
 
@@ -1213,18 +1189,18 @@ class DeepseekDecoderLayer(nn.Module):
         )
 
         self.mlp = (
-            DeepseekMoE(config)
+            DeepseekV2MoE(config)
             if (
                 config.n_routed_experts is not None
                 and layer_idx >= config.first_k_dense_replace
                 and layer_idx % config.moe_layer_freq == 0
             )
-            else DeepseekMLP(config)
+            else DeepseekV2MLP(config)
         )
-        self.input_layernorm = DeepseekRMSNorm(
+        self.input_layernorm = DeepseekV2RMSNorm(
             config.hidden_size, eps=config.rms_norm_eps
         )
-        self.post_attention_layernorm = DeepseekRMSNorm(
+        self.post_attention_layernorm = DeepseekV2RMSNorm(
             config.hidden_size, eps=config.rms_norm_eps
         )
 
@@ -1291,7 +1267,7 @@ class DeepseekDecoderLayer(nn.Module):
         return outputs
 
 
-Deepseek_START_DOCSTRING = r"""
+DeepseekV2_START_DOCSTRING = r"""
     This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
     library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
     etc.)
@@ -1301,7 +1277,7 @@ Deepseek_START_DOCSTRING = r"""
     and behavior.
 
     Parameters:
-        config ([`DeepseekConfig`]):
+        config ([`DeepseekV2Config`]):
             Model configuration class with all the parameters of the model. Initializing with a config file does not
             load the weights associated with the model, only the configuration. Check out the
             [`~PreTrainedModel.from_pretrained`] method to load the model weights.
@@ -1309,14 +1285,14 @@ Deepseek_START_DOCSTRING = r"""
 
 
 @add_start_docstrings(
-    "The bare Deepseek Model outputting raw hidden-states without any specific head on top.",
-    Deepseek_START_DOCSTRING,
+    "The bare DeepseekV2 Model outputting raw hidden-states without any specific head on top.",
+    DeepseekV2_START_DOCSTRING,
 )
-class DeepseekPreTrainedModel(PreTrainedModel):
-    config_class = DeepseekConfig
+class DeepseekV2PreTrainedModel(PreTrainedModel):
+    config_class = DeepseekV2Config
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["DeepseekDecoderLayer"]
+    _no_split_modules = ["DeepseekV2DecoderLayer"]
     _skip_keys_device_placement = "past_key_values"
     _supports_flash_attn_2 = True
     _supports_sdpa = True
@@ -1334,7 +1310,7 @@ class DeepseekPreTrainedModel(PreTrainedModel):
             module.weight.data[module.padding_idx].zero_()
 
 
-Deepseek_INPUTS_DOCSTRING = r"""
+DeepseekV2_INPUTS_DOCSTRING = r"""
     Args:
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
             Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
@@ -1405,18 +1381,18 @@ Deepseek_INPUTS_DOCSTRING = r"""
 
 
 @add_start_docstrings(
-    "The bare Deepseek Model outputting raw hidden-states without any specific head on top.",
-    Deepseek_START_DOCSTRING,
+    "The bare DeepseekV2 Model outputting raw hidden-states without any specific head on top.",
+    DeepseekV2_START_DOCSTRING,
 )
-class DeepseekModel(DeepseekPreTrainedModel):
+class DeepseekV2Model(DeepseekV2PreTrainedModel):
     """
-    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`DeepseekDecoderLayer`]
+    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`DeepseekV2DecoderLayer`]
 
     Args:
-        config: DeepseekConfig
+        config: DeepseekV2Config
     """
 
-    def __init__(self, config: DeepseekConfig):
+    def __init__(self, config: DeepseekV2Config):
         super().__init__(config)
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
@@ -1426,13 +1402,13 @@ class DeepseekModel(DeepseekPreTrainedModel):
         )
         self.layers = nn.ModuleList(
             [
-                DeepseekDecoderLayer(config, layer_idx)
+                DeepseekV2DecoderLayer(config, layer_idx)
                 for layer_idx in range(config.num_hidden_layers)
             ]
         )
         self._use_sdpa = config._attn_implementation == "sdpa"
         self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
-        self.norm = DeepseekRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.norm = DeepseekV2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
         self.gradient_checkpointing = False
         # Initialize weights and apply final processing
@@ -1444,7 +1420,7 @@ class DeepseekModel(DeepseekPreTrainedModel):
     def set_input_embeddings(self, value):
         self.embed_tokens = value
 
-    @add_start_docstrings_to_model_forward(Deepseek_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(DeepseekV2_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids: torch.LongTensor = None,
@@ -1604,12 +1580,12 @@ class DeepseekModel(DeepseekPreTrainedModel):
         )
 
 
-class DeepseekForCausalLM(DeepseekPreTrainedModel):
+class DeepseekV2ForCausalLM(DeepseekV2PreTrainedModel):
     _tied_weights_keys = ["lm_head.weight"]
 
     def __init__(self, config):
         super().__init__(config)
-        self.model = DeepseekModel(config)
+        self.model = DeepseekV2Model(config)
         self.vocab_size = config.vocab_size
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
 
@@ -1634,7 +1610,7 @@ class DeepseekForCausalLM(DeepseekPreTrainedModel):
     def get_decoder(self):
         return self.model
 
-    @add_start_docstrings_to_model_forward(Deepseek_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(DeepseekV2_INPUTS_DOCSTRING)
     @replace_return_docstrings(
         output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
     )
@@ -1663,9 +1639,9 @@ class DeepseekForCausalLM(DeepseekPreTrainedModel):
         Example:
 
         ```python
-        >>> from transformers import AutoTokenizer, DeepseekForCausalLM
+        >>> from transformers import AutoTokenizer, DeepseekV2ForCausalLM
 
-        >>> model = DeepseekForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
+        >>> model = DeepseekV2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
         >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
 
         >>> prompt = "Hey, are you conscious? Can you talk to me?"
@@ -1811,9 +1787,9 @@ class DeepseekForCausalLM(DeepseekPreTrainedModel):
 
 @add_start_docstrings(
     """
-    The Deepseek Model transformer with a sequence classification head on top (linear layer).
+    The DeepseekV2 Model transformer with a sequence classification head on top (linear layer).
 
-    [`DeepseekForSequenceClassification`] uses the last token in order to do the classification, as other causal models
+    [`DeepseekV2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
     (e.g. GPT-2) do.
 
     Since it does classification on the last token, it requires to know the position of the last token. If a
@@ -1822,13 +1798,13 @@ class DeepseekForCausalLM(DeepseekPreTrainedModel):
     padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
     each row of the batch).
     """,
-    Deepseek_START_DOCSTRING,
+    DeepseekV2_START_DOCSTRING,
 )
-class DeepseekForSequenceClassification(DeepseekPreTrainedModel):
+class DeepseekV2ForSequenceClassification(DeepseekV2PreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.num_labels = config.num_labels
-        self.model = DeepseekModel(config)
+        self.model = DeepseekV2Model(config)
         self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
 
         # Initialize weights and apply final processing
@@ -1840,7 +1816,7 @@ class DeepseekForSequenceClassification(DeepseekPreTrainedModel):
     def set_input_embeddings(self, value):
         self.model.embed_tokens = value
 
-    @add_start_docstrings_to_model_forward(Deepseek_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(DeepseekV2_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids: torch.LongTensor = None,
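The commit only renames classes, docstring constants, and the `model_type` string; parameter names are untouched, so existing weights load as before. The renamed `ATTENTION_CLASSES` mapping ("eager" vs "flash_attention_2") is selected through the standard `attn_implementation` argument rather than called directly. A minimal sketch under the same placeholder-repo assumption as above (flash-attn must be installed for the flash option):

```python
import torch
from transformers import AutoModelForCausalLM

# attn_implementation="flash_attention_2" makes the remote code pick
# DeepseekV2FlashAttention2 from ATTENTION_CLASSES; "eager" picks DeepseekV2Attention.
model = AutoModelForCausalLM.from_pretrained(
    "deepseek-ai/DeepSeek-V2",  # placeholder repo id
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
```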