Upload configuration_minicpm.py
configuration_minicpm.py  CHANGED  +19 -18
@@ -1,10 +1,5 @@
 # coding=utf-8
-# Copyright
-#
-# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
-# and OPT implementations in this library. It has been modified from its
-# original forms to accommodate minor architectural differences compared
-# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+# Copyright 2025 The OpenBMB Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -22,7 +17,6 @@
 from transformers.configuration_utils import PretrainedConfig
 from transformers.utils import logging
 
-
 logger = logging.get_logger(__name__)
 
 MINICPM_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
@@ -111,8 +105,8 @@ class MiniCPMConfig(PretrainedConfig):
     >>> configuration = model.config
     ```"""
 
-    model_type = "minicpm"
-    keys_to_ignore_at_inference = ["past_key_values"]
+    model_type = 'minicpm'
+    keys_to_ignore_at_inference = ['past_key_values']
 
     def __init__(
         self,
@@ -122,7 +116,7 @@ class MiniCPMConfig(PretrainedConfig):
         num_hidden_layers=32,
         num_attention_heads=32,
         num_key_value_heads=None,
-        hidden_act="silu",
+        hidden_act='silu',
         max_position_embeddings=2048,
         initializer_range=0.02,
         rms_norm_eps=1e-6,
@@ -139,8 +133,10 @@ class MiniCPMConfig(PretrainedConfig):
         scale_emb=1,
         dim_model_base=1,
         scale_depth=1,
-        **kwargs,
-    ):
+        mup_denominator=32,
+        sparse_config=None,
+        **kwargs):
+
         self.vocab_size = vocab_size
         self.max_position_embeddings = max_position_embeddings
         self.hidden_size = hidden_size
@@ -166,6 +162,11 @@ class MiniCPMConfig(PretrainedConfig):
         self.scale_emb = scale_emb
         self.dim_model_base = dim_model_base
         self.scale_depth = scale_depth
+        # only used for Eagle Head
+        self.mup_denominator = mup_denominator
+
+        # sparse config
+        self.sparse_config = sparse_config
 
         super().__init__(
             pad_token_id=pad_token_id,
@@ -176,7 +177,7 @@ class MiniCPMConfig(PretrainedConfig):
         )
         try:
             import flash_attn
-            self._attn_implementation = "flash_attention_2"
+            self._attn_implementation = 'flash_attention_2'
         except:
             pass
 
@@ -189,12 +190,12 @@ class MiniCPMConfig(PretrainedConfig):
 
         if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
             raise ValueError(
-                "`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, "
-                f"got {self.rope_scaling}"
+                '`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, '
+                f'got {self.rope_scaling}'
             )
-        rope_scaling_type = self.rope_scaling.get("type", None)
-        rope_scaling_factor = self.rope_scaling.get("factor", None)
-        if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
+        rope_scaling_type = self.rope_scaling.get('type', None)
+        rope_scaling_factor = self.rope_scaling.get('factor', None)
+        if rope_scaling_type is None or rope_scaling_type not in ['linear', 'dynamic']:
             raise ValueError(
                 f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
            )
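For context, a minimal sketch of how the two fields this commit introduces surface on a constructed config. It assumes the uploaded configuration_minicpm.py is importable from the working directory; the model sizes and the sparse_config payload below are hypothetical placeholders, since the commit only stores the dict and does not define its schema.

# Minimal sketch (assumption: configuration_minicpm.py from this repo is on the Python path).
from configuration_minicpm import MiniCPMConfig

config = MiniCPMConfig(
    hidden_size=2304,                  # illustrative sizes, not tied to a released checkpoint
    num_hidden_layers=40,
    num_attention_heads=36,
    mup_denominator=32,                # new field; per the diff comment, only used for the Eagle Head
    sparse_config={'block_size': 64},  # new field; hypothetical contents, stored as-is by __init__
)

print(config.mup_denominator)  # 32
print(config.sparse_config)    # {'block_size': 64}
# If the flash_attn package is importable, __init__ also sets
# config._attn_implementation to 'flash_attention_2' (see the try/except in the diff).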