Upload configuration_minicpm.py
configuration_minicpm.py  CHANGED  +19 -18
@@ -1,10 +1,5 @@
 # coding=utf-8
-# Copyright
-#
-# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
-# and OPT implementations in this library. It has been modified from its
-# original forms to accommodate minor architectural differences compared
-# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+# Copyright 2025 The OpenBMB Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -22,7 +17,6 @@
 from transformers.configuration_utils import PretrainedConfig
 from transformers.utils import logging
 
-
 logger = logging.get_logger(__name__)
 
 MINICPM_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
@@ -111,8 +105,8 @@ class MiniCPMConfig(PretrainedConfig):
     >>> configuration = model.config
     ```"""
 
-    model_type = "minicpm"
-    keys_to_ignore_at_inference = ["past_key_values"]
+    model_type = 'minicpm'
+    keys_to_ignore_at_inference = ['past_key_values']
 
     def __init__(
         self,
@@ -122,7 +116,7 @@ class MiniCPMConfig(PretrainedConfig):
         num_hidden_layers=32,
         num_attention_heads=32,
         num_key_value_heads=None,
-        hidden_act="silu",
+        hidden_act='silu',
         max_position_embeddings=2048,
         initializer_range=0.02,
         rms_norm_eps=1e-6,
@@ -139,8 +133,10 @@ class MiniCPMConfig(PretrainedConfig):
         scale_emb=1,
         dim_model_base=1,
         scale_depth=1,
-        **kwargs,
-    ):
+        mup_denominator=32,
+        sparse_config=None,
+        **kwargs):
+
         self.vocab_size = vocab_size
         self.max_position_embeddings = max_position_embeddings
         self.hidden_size = hidden_size
@@ -166,6 +162,11 @@ class MiniCPMConfig(PretrainedConfig):
         self.scale_emb = scale_emb
         self.dim_model_base = dim_model_base
         self.scale_depth = scale_depth
+        # only used for Eagle Head
+        self.mup_denominator = mup_denominator
+
+        # sparse config
+        self.sparse_config = sparse_config
 
         super().__init__(
             pad_token_id=pad_token_id,
@@ -176,7 +177,7 @@ class MiniCPMConfig(PretrainedConfig):
         )
         try:
             import flash_attn
-            self._attn_implementation = "flash_attention_2"
+            self._attn_implementation = 'flash_attention_2'
         except:
             pass
 
@@ -189,12 +190,12 @@ class MiniCPMConfig(PretrainedConfig):
 
         if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
             raise ValueError(
-                "`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, "
-                f"got {self.rope_scaling}"
+                '`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, '
+                f'got {self.rope_scaling}'
             )
-        rope_scaling_type = self.rope_scaling.get("type", None)
-        rope_scaling_factor = self.rope_scaling.get("factor", None)
-        if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
+        rope_scaling_type = self.rope_scaling.get('type', None)
+        rope_scaling_factor = self.rope_scaling.get('factor', None)
+        if rope_scaling_type is None or rope_scaling_type not in ['linear', 'dynamic']:
             raise ValueError(
                 f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
            )
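For context, a minimal sketch of how the two fields this commit introduces surface on a constructed config. It assumes the uploaded configuration_minicpm.py is importable from the working directory; the model sizes and the sparse_config payload below are hypothetical placeholders, since the commit only stores the dict and does not define its schema.

# Minimal sketch (assumption: configuration_minicpm.py from this repo is on the Python path).
from configuration_minicpm import MiniCPMConfig

config = MiniCPMConfig(
    hidden_size=2304,                  # illustrative sizes, not tied to a released checkpoint
    num_hidden_layers=40,
    num_attention_heads=36,
    mup_denominator=32,                # new field; per the diff comment, only used for the Eagle Head
    sparse_config={'block_size': 64},  # new field; hypothetical contents, stored as-is by __init__
)

print(config.mup_denominator)  # 32
print(config.sparse_config)    # {'block_size': 64}
# If the flash_attn package is importable, __init__ also sets
# config._attn_implementation to 'flash_attention_2' (see the try/except in the diff).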