rulixiang commited on
Commit
303bbc0
·
1 Parent(s): 067f25c

Update ckpt

Browse files
config.json CHANGED
@@ -1,58 +1,57 @@
1
  {
2
  "_name_or_path": "FalconTST",
 
 
3
  "architectures": [
4
  "FalconTSTForPrediction"
5
  ],
6
  "auto_map": {
7
  "AutoConfig": "configuration_FalconTST.FalconTSTConfig",
8
- "AutoModelForCausalLM": "modeling_FalconTST.FalconTSTForPrediction"
9
  },
10
- "disable_bias_linear": false,
11
- "do_base_forecast": false,
12
- "do_expert_forecast": true,
13
- "expert_num_layers": 4,
14
- "ffn_hidden_size": 4096,
15
- "heterogeneous_moe_layer": false,
16
  "hidden_size": 1024,
17
- "init_method_std": 0.06,
18
- "is_revin": true,
19
- "k_layernorm": false,
20
- "kv_channels": 64,
21
- "mask_pad_value": 255.0,
22
- "model_type": "FalconTST",
23
- "moe_expert_final_layernorm": true,
24
- "moe_ffn_hidden_size": 4096,
25
- "moe_router_enable_expert_bias": false,
26
- "moe_router_input_size": 2880,
27
- "moe_router_pre_softmax": true,
28
- "moe_router_score_function": "softmax",
29
- "moe_router_topk": 1,
30
- "moe_shared_expert_intermediate_size": 4096,
31
- "multi_forecast_head_list": [
32
- 24,
33
- 96,
34
- 336
35
- ],
36
  "num_attention_heads": 16,
37
- "num_hidden_layers": 2,
38
- "num_moe_experts": 4,
39
- "torch_dtype": "bfloat16",
 
40
  "patch_size_list": [
41
  120,
42
  96,
43
  64,
44
  36
45
  ],
46
- "pred_length": 336,
47
- "q_layernorm": false,
48
  "residual_backcast": true,
 
 
 
 
 
 
 
 
 
 
49
  "rotary_base": 1000000,
50
  "rotary_interleaved": false,
51
- "seq_length": 2880,
52
- "shared_patch_size": 32,
53
- "tie_word_embeddings": false,
54
  "transformer_input_layernorm": true,
55
- "transformers_version": "4.40.1",
56
- "use_cache": true,
57
- "use_cpu_initialization": true
 
 
 
 
 
 
 
 
 
 
58
  }
 
1
  {
2
  "_name_or_path": "FalconTST",
3
+ "model_type": "FalconTST",
4
+ "transformers_version": "4.40.1",
5
  "architectures": [
6
  "FalconTSTForPrediction"
7
  ],
8
  "auto_map": {
9
  "AutoConfig": "configuration_FalconTST.FalconTSTConfig",
10
+ "AutoModel": "modeling_FalconTST.FalconTSTForPrediction"
11
  },
12
+
13
+ "add_bias_linear": false,
14
+ "num_hidden_layers": 2,
 
 
 
15
  "hidden_size": 1024,
16
+ "ffn_hidden_size": 4096,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  "num_attention_heads": 16,
18
+ "seq_length": 2880,
19
+ "mask_pad_value": 255.0,
20
+ "is_revin": true,
21
+ "shared_patch_size": 32,
22
  "patch_size_list": [
23
  120,
24
  96,
25
  64,
26
  36
27
  ],
 
 
28
  "residual_backcast": true,
29
+ "do_base_forecast": false,
30
+ "do_expert_forecast": true,
31
+ "heterogeneous_moe_layer": false,
32
+ "expert_num_layers": 4,
33
+ "multi_forecast_head_list": [
34
+ 24,
35
+ 96,
36
+ 336
37
+ ],
38
+ "multi_forecast_head_type": "single",
39
  "rotary_base": 1000000,
40
  "rotary_interleaved": false,
41
+ "q_layernorm": false,
42
+ "k_layernorm": false,
 
43
  "transformer_input_layernorm": true,
44
+
45
+ "num_experts": 4,
46
+ "moe_router_topk": 1,
47
+ "moe_router_pre_softmax": true,
48
+ "moe_router_score_function": "softmax",
49
+ "moe_ffn_hidden_size": 4096,
50
+ "moe_shared_expert_intermediate_size": 4096,
51
+ "moe_router_enable_expert_bias": false,
52
+ "moe_expert_final_layernorm": true,
53
+
54
+ "use_cpu_initialization": true,
55
+ "init_method_std": 0.06,
56
+ "use_cache": true
57
  }
configuration_FalconTST.py CHANGED
@@ -100,112 +100,110 @@ class FalconTSTConfig(PretrainedConfig):
100
  """
101
 
102
  model_type = "FalconTST"
103
- keys_to_ignore_at_inference = ["past_key_values"]
104
 
105
  def __init__(
106
  self,
107
- hidden_size: int = 1024,
108
- ffn_hidden_size: int = 4096,
109
- seq_length: int = 2880,
110
  add_bias_linear: bool = False,
111
- rope_theta: int = 10000,
112
  num_hidden_layers: int = 3,
 
 
113
  num_attention_heads: int = 16,
 
114
  mask_pad_value: float = 255.0,
115
- expert_num_layers: int = 4,
116
- shared_patch_size: int = 64,
117
-
118
- patch_size_list: Optional[List[int]] = None,
119
- multi_forecast_head_list: Optional[List[int]] = None,
120
  is_revin: bool = True,
121
- use_cpu_initialization: bool = False,
122
- rotary_interleaved: bool = False,
123
- do_expert_forecast: bool = True,
124
  residual_backcast: bool = True,
125
  do_base_forecast: bool = False,
126
- heterogeneous_moe_layer: bool = True,
127
- test_data_seq_len: int = 2880,
128
- test_data_test_len: int = 720,
129
- autoregressive_step_list: Optional[List[int]] = None,
130
  multi_forecast_head_type: str = "single",
 
 
 
131
 
 
132
  num_experts: int = 4,
133
  moe_router_topk: int = 2,
 
 
134
  moe_ffn_hidden_size: int = 4096,
135
  moe_shared_expert_intermediate_size: int = 4096,
136
- init_method_std: float = 0.06,
137
- initializer_range: float = 0.02,
138
  moe_router_enable_expert_bias: bool = False,
139
  moe_expert_final_layernorm: bool = True,
140
- transformer_input_layernorm: bool = True,
141
- moe_router_pre_softmax: bool = True,
142
- q_layernorm: bool = False,
143
- k_layernorm: bool = False,
144
- moe_router_score_function: str = "softmax",
145
- tie_word_embeddings: bool = False,
 
 
 
 
 
146
  **kwargs,
147
  ):
148
  """Initialize FalconTST configuration."""
149
 
150
- # Set default values for list parameters
151
- if patch_size_list is None:
152
- patch_size_list = [96, 64, 48, 24]
153
- if multi_forecast_head_list is None:
154
- multi_forecast_head_list = [24, 96, 336]
155
- if autoregressive_step_list is None:
156
- autoregressive_step_list = [2, 4, 1]
157
-
158
- # FalconTST inference specific
159
- self.test_data_seq_len = test_data_seq_len
160
- self.inference_length = test_data_test_len
161
- self.autoregressive_step_list = autoregressive_step_list
162
- self.multi_forecast_head_type = multi_forecast_head_type
163
- self.use_cache = True
164
-
165
- # FalconTST specific
166
  self.hidden_size = hidden_size
167
  self.ffn_hidden_size = ffn_hidden_size
168
  self.num_attention_heads = num_attention_heads
169
- self.init_method_std = init_method_std
170
- self.initializer_range = initializer_range
171
  self.seq_length = seq_length
172
- self.multi_forecast_head_list = multi_forecast_head_list
173
- self.kv_channels=self.hidden_size // self.num_attention_heads
174
- self.rotary_base = rope_theta
175
- self.num_hidden_layers = num_hidden_layers
176
  self.mask_pad_value = mask_pad_value
177
- self.pred_length = max(self.multi_forecast_head_list)
178
- self.add_bias_linear = add_bias_linear
179
  self.is_revin = is_revin
 
 
 
 
 
180
  self.do_base_forecast = do_base_forecast
181
  self.do_expert_forecast = do_expert_forecast
182
- self.residual_backcast = residual_backcast
183
  self.heterogeneous_moe_layer = heterogeneous_moe_layer
184
- self.use_cpu_initialization = use_cpu_initialization
 
 
 
 
 
 
185
  self.rotary_interleaved = rotary_interleaved
186
-
187
- # expert specific
188
- self.patch_size_list = patch_size_list
189
  self.num_moe_experts = num_experts
190
- self.shared_patch_size = shared_patch_size
191
- self.expert_num_layers = expert_num_layers
192
- self.moe_router_input_size = self.seq_length
193
  self.moe_router_topk = moe_router_topk
 
 
194
  self.moe_router_score_function = moe_router_score_function
195
  self.moe_ffn_hidden_size = moe_ffn_hidden_size
196
- self.moe_shared_expert_intermediate_size=moe_shared_expert_intermediate_size
197
  self.moe_router_enable_expert_bias = moe_router_enable_expert_bias
198
  self.moe_expert_final_layernorm = moe_expert_final_layernorm
199
- self.transformer_input_layernorm = transformer_input_layernorm
200
- self.moe_router_pre_softmax = moe_router_pre_softmax
201
- self.q_layernorm = q_layernorm
202
- self.k_layernorm = k_layernorm
203
 
 
 
 
 
 
 
 
 
 
 
 
204
 
 
205
 
206
- kwargs.pop('tie_word_embeddings', None)
207
  super().__init__(
208
- tie_word_embeddings=tie_word_embeddings,
209
  **kwargs,
210
  )
211
 
 
100
  """
101
 
102
  model_type = "FalconTST"
 
103
 
104
  def __init__(
105
  self,
106
+
107
+ # model configs
 
108
  add_bias_linear: bool = False,
 
109
  num_hidden_layers: int = 3,
110
+ hidden_size: int = 1024,
111
+ ffn_hidden_size: int = 4096,
112
  num_attention_heads: int = 16,
113
+ seq_length: int = 2880,
114
  mask_pad_value: float = 255.0,
 
 
 
 
 
115
  is_revin: bool = True,
116
+ shared_patch_size: int = 32,
117
+ patch_size_list: Optional[List[int]] = None,
 
118
  residual_backcast: bool = True,
119
  do_base_forecast: bool = False,
120
+ do_expert_forecast: bool = True,
121
+ heterogeneous_moe_layer: bool = False,
122
+ expert_num_layers: int = 4,
123
+ multi_forecast_head_list: Optional[List[int]] = None,
124
  multi_forecast_head_type: str = "single",
125
+ rope_theta: int = 1000000,
126
+ rotary_interleaved: bool = False,
127
+ block_input_layernorm: bool = True,
128
 
129
+ # moe configs
130
  num_experts: int = 4,
131
  moe_router_topk: int = 2,
132
+ moe_router_pre_softmax: bool = True,
133
+ moe_router_score_function: str = "softmax",
134
  moe_ffn_hidden_size: int = 4096,
135
  moe_shared_expert_intermediate_size: int = 4096,
 
 
136
  moe_router_enable_expert_bias: bool = False,
137
  moe_expert_final_layernorm: bool = True,
138
+
139
+ # initial configs
140
+ use_cpu_initialization: bool = False,
141
+ init_method_std: float = 0.06,
142
+ initializer_range: float = 0.02,
143
+
144
+ # test configs
145
+ test_data_seq_len: int = 2880,
146
+ test_data_test_len: int = 720,
147
+ autoregressive_step_list: Optional[List[int]] = None,
148
+
149
  **kwargs,
150
  ):
151
  """Initialize FalconTST configuration."""
152
 
153
+ # model configs
154
+ self.add_bias_linear = add_bias_linear
155
+ self.num_hidden_layers = num_hidden_layers
 
 
 
 
 
 
 
 
 
 
 
 
 
156
  self.hidden_size = hidden_size
157
  self.ffn_hidden_size = ffn_hidden_size
158
  self.num_attention_heads = num_attention_heads
159
+ self.kv_channels = self.hidden_size // self.num_attention_heads
 
160
  self.seq_length = seq_length
 
 
 
 
161
  self.mask_pad_value = mask_pad_value
 
 
162
  self.is_revin = is_revin
163
+ self.shared_patch_size = shared_patch_size
164
+ if patch_size_list is None:
165
+ patch_size_list = [96, 64, 48, 24]
166
+ self.patch_size_list = patch_size_list
167
+ self.residual_backcast = residual_backcast
168
  self.do_base_forecast = do_base_forecast
169
  self.do_expert_forecast = do_expert_forecast
 
170
  self.heterogeneous_moe_layer = heterogeneous_moe_layer
171
+ self.expert_num_layers = expert_num_layers
172
+ if multi_forecast_head_list is None:
173
+ multi_forecast_head_list = [24, 96, 336]
174
+ self.multi_forecast_head_list = multi_forecast_head_list
175
+ self.pred_length = max(self.multi_forecast_head_list)
176
+ self.multi_forecast_head_type = multi_forecast_head_type
177
+ self.rotary_base = rope_theta
178
  self.rotary_interleaved = rotary_interleaved
179
+ self.block_input_layernorm = block_input_layernorm
180
+
181
+ # moe configs
182
  self.num_moe_experts = num_experts
 
 
 
183
  self.moe_router_topk = moe_router_topk
184
+ self.moe_router_input_size = self.seq_length
185
+ self.moe_router_pre_softmax = moe_router_pre_softmax
186
  self.moe_router_score_function = moe_router_score_function
187
  self.moe_ffn_hidden_size = moe_ffn_hidden_size
188
+ self.moe_shared_expert_intermediate_size = moe_shared_expert_intermediate_size
189
  self.moe_router_enable_expert_bias = moe_router_enable_expert_bias
190
  self.moe_expert_final_layernorm = moe_expert_final_layernorm
 
 
 
 
191
 
192
+ # initial configs
193
+ self.use_cpu_initialization = use_cpu_initialization
194
+ self.init_method_std = init_method_std
195
+ self.initializer_range = initializer_range
196
+
197
+ # test configs
198
+ self.test_data_seq_len = test_data_seq_len
199
+ self.inference_length = test_data_test_len
200
+ if autoregressive_step_list is None:
201
+ autoregressive_step_list = [2, 4, 1]
202
+ self.autoregressive_step_list = autoregressive_step_list
203
 
204
+ self.use_cache = True
205
 
 
206
  super().__init__(
 
207
  **kwargs,
208
  )
209
 
generation_config.json DELETED
@@ -1,4 +0,0 @@
1
- {
2
- "_from_model_config": true,
3
- "transformers_version": "4.40.1"
4
- }
 
 
 
 
 
model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e5a15d1fcb6388aed06deb70f77918cd38899476dc0c4b1ac7dc57391cf8a477
3
- size 1264771376
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a7689d19b8af45f5261b86f22d6d57ffa6feb1690170c1afc6a43d8be8f46ca
3
+ size 1264777232
model.safetensors.index.json CHANGED
@@ -1,292 +1,293 @@
1
  {
2
- "metadata": {
3
- "total_size": 4983109888
4
- },
5
- "weight_map": {
6
- "model.decoder.layers.0.router.weight": "model-00001-of-00002.safetensors",
7
- "model.decoder.layers.0.backcast_layernorm.weight": "model-00001-of-00002.safetensors",
8
- "model.decoder.layers.0.experts.local_experts.0.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
9
- "model.decoder.layers.0.experts.local_experts.0.layers.0.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
10
- "model.decoder.layers.0.experts.local_experts.0.layers.0.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
11
- "model.decoder.layers.0.experts.local_experts.0.layers.0.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
12
- "model.decoder.layers.0.experts.local_experts.0.layers.0.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
13
- "model.decoder.layers.0.experts.local_experts.0.layers.0.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
14
- "model.decoder.layers.0.experts.local_experts.0.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
15
- "model.decoder.layers.0.experts.local_experts.0.layers.1.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
16
- "model.decoder.layers.0.experts.local_experts.0.layers.1.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
17
- "model.decoder.layers.0.experts.local_experts.0.layers.1.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
18
- "model.decoder.layers.0.experts.local_experts.0.layers.1.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
19
- "model.decoder.layers.0.experts.local_experts.0.layers.1.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
20
- "model.decoder.layers.0.experts.local_experts.0.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
21
- "model.decoder.layers.0.experts.local_experts.0.layers.2.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
22
- "model.decoder.layers.0.experts.local_experts.0.layers.2.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
23
- "model.decoder.layers.0.experts.local_experts.0.layers.2.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
24
- "model.decoder.layers.0.experts.local_experts.0.layers.2.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
25
- "model.decoder.layers.0.experts.local_experts.0.layers.2.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
26
- "model.decoder.layers.0.experts.local_experts.0.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
27
- "model.decoder.layers.0.experts.local_experts.0.layers.3.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
28
- "model.decoder.layers.0.experts.local_experts.0.layers.3.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
29
- "model.decoder.layers.0.experts.local_experts.0.layers.3.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
30
- "model.decoder.layers.0.experts.local_experts.0.layers.3.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
31
- "model.decoder.layers.0.experts.local_experts.0.layers.3.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
32
- "model.decoder.layers.0.experts.local_experts.0.final_layernorm.weight": "model-00001-of-00002.safetensors",
33
- "model.decoder.layers.0.experts.local_experts.0.patch_embedding.linear_fc1.weight": "model-00001-of-00002.safetensors",
34
- "model.decoder.layers.0.experts.local_experts.0.patch_embedding.linear_fc2.weight": "model-00001-of-00002.safetensors",
35
- "model.decoder.layers.0.experts.local_experts.0.output_layer.weight": "model-00001-of-00002.safetensors",
36
- "model.decoder.layers.0.experts.local_experts.1.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
37
- "model.decoder.layers.0.experts.local_experts.1.layers.0.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
38
- "model.decoder.layers.0.experts.local_experts.1.layers.0.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
39
- "model.decoder.layers.0.experts.local_experts.1.layers.0.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
40
- "model.decoder.layers.0.experts.local_experts.1.layers.0.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
41
- "model.decoder.layers.0.experts.local_experts.1.layers.0.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
42
- "model.decoder.layers.0.experts.local_experts.1.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
43
- "model.decoder.layers.0.experts.local_experts.1.layers.1.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
44
- "model.decoder.layers.0.experts.local_experts.1.layers.1.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
45
- "model.decoder.layers.0.experts.local_experts.1.layers.1.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
46
- "model.decoder.layers.0.experts.local_experts.1.layers.1.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
47
- "model.decoder.layers.0.experts.local_experts.1.layers.1.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
48
- "model.decoder.layers.0.experts.local_experts.1.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
49
- "model.decoder.layers.0.experts.local_experts.1.layers.2.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
50
- "model.decoder.layers.0.experts.local_experts.1.layers.2.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
51
- "model.decoder.layers.0.experts.local_experts.1.layers.2.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
52
- "model.decoder.layers.0.experts.local_experts.1.layers.2.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
53
- "model.decoder.layers.0.experts.local_experts.1.layers.2.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
54
- "model.decoder.layers.0.experts.local_experts.1.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
55
- "model.decoder.layers.0.experts.local_experts.1.layers.3.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
56
- "model.decoder.layers.0.experts.local_experts.1.layers.3.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
57
- "model.decoder.layers.0.experts.local_experts.1.layers.3.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
58
- "model.decoder.layers.0.experts.local_experts.1.layers.3.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
59
- "model.decoder.layers.0.experts.local_experts.1.layers.3.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
60
- "model.decoder.layers.0.experts.local_experts.1.final_layernorm.weight": "model-00001-of-00002.safetensors",
61
- "model.decoder.layers.0.experts.local_experts.1.patch_embedding.linear_fc1.weight": "model-00001-of-00002.safetensors",
62
- "model.decoder.layers.0.experts.local_experts.1.patch_embedding.linear_fc2.weight": "model-00001-of-00002.safetensors",
63
- "model.decoder.layers.0.experts.local_experts.1.output_layer.weight": "model-00001-of-00002.safetensors",
64
- "model.decoder.layers.0.experts.local_experts.2.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
65
- "model.decoder.layers.0.experts.local_experts.2.layers.0.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
66
- "model.decoder.layers.0.experts.local_experts.2.layers.0.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
67
- "model.decoder.layers.0.experts.local_experts.2.layers.0.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
68
- "model.decoder.layers.0.experts.local_experts.2.layers.0.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
69
- "model.decoder.layers.0.experts.local_experts.2.layers.0.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
70
- "model.decoder.layers.0.experts.local_experts.2.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
71
- "model.decoder.layers.0.experts.local_experts.2.layers.1.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
72
- "model.decoder.layers.0.experts.local_experts.2.layers.1.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
73
- "model.decoder.layers.0.experts.local_experts.2.layers.1.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
74
- "model.decoder.layers.0.experts.local_experts.2.layers.1.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
75
- "model.decoder.layers.0.experts.local_experts.2.layers.1.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
76
- "model.decoder.layers.0.experts.local_experts.2.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
77
- "model.decoder.layers.0.experts.local_experts.2.layers.2.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
78
- "model.decoder.layers.0.experts.local_experts.2.layers.2.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
79
- "model.decoder.layers.0.experts.local_experts.2.layers.2.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
80
- "model.decoder.layers.0.experts.local_experts.2.layers.2.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
81
- "model.decoder.layers.0.experts.local_experts.2.layers.2.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
82
- "model.decoder.layers.0.experts.local_experts.2.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
83
- "model.decoder.layers.0.experts.local_experts.2.layers.3.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
84
- "model.decoder.layers.0.experts.local_experts.2.layers.3.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
85
- "model.decoder.layers.0.experts.local_experts.2.layers.3.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
86
- "model.decoder.layers.0.experts.local_experts.2.layers.3.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
87
- "model.decoder.layers.0.experts.local_experts.2.layers.3.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
88
- "model.decoder.layers.0.experts.local_experts.2.final_layernorm.weight": "model-00001-of-00002.safetensors",
89
- "model.decoder.layers.0.experts.local_experts.2.patch_embedding.linear_fc1.weight": "model-00001-of-00002.safetensors",
90
- "model.decoder.layers.0.experts.local_experts.2.patch_embedding.linear_fc2.weight": "model-00001-of-00002.safetensors",
91
- "model.decoder.layers.0.experts.local_experts.2.output_layer.weight": "model-00001-of-00002.safetensors",
92
- "model.decoder.layers.0.experts.local_experts.3.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
93
- "model.decoder.layers.0.experts.local_experts.3.layers.0.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
94
- "model.decoder.layers.0.experts.local_experts.3.layers.0.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
95
- "model.decoder.layers.0.experts.local_experts.3.layers.0.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
96
- "model.decoder.layers.0.experts.local_experts.3.layers.0.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
97
- "model.decoder.layers.0.experts.local_experts.3.layers.0.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
98
- "model.decoder.layers.0.experts.local_experts.3.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
99
- "model.decoder.layers.0.experts.local_experts.3.layers.1.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
100
- "model.decoder.layers.0.experts.local_experts.3.layers.1.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
101
- "model.decoder.layers.0.experts.local_experts.3.layers.1.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
102
- "model.decoder.layers.0.experts.local_experts.3.layers.1.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
103
- "model.decoder.layers.0.experts.local_experts.3.layers.1.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
104
- "model.decoder.layers.0.experts.local_experts.3.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
105
- "model.decoder.layers.0.experts.local_experts.3.layers.2.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
106
- "model.decoder.layers.0.experts.local_experts.3.layers.2.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
107
- "model.decoder.layers.0.experts.local_experts.3.layers.2.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
108
- "model.decoder.layers.0.experts.local_experts.3.layers.2.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
109
- "model.decoder.layers.0.experts.local_experts.3.layers.2.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
110
- "model.decoder.layers.0.experts.local_experts.3.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
111
- "model.decoder.layers.0.experts.local_experts.3.layers.3.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
112
- "model.decoder.layers.0.experts.local_experts.3.layers.3.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
113
- "model.decoder.layers.0.experts.local_experts.3.layers.3.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
114
- "model.decoder.layers.0.experts.local_experts.3.layers.3.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
115
- "model.decoder.layers.0.experts.local_experts.3.layers.3.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
116
- "model.decoder.layers.0.experts.local_experts.3.final_layernorm.weight": "model-00001-of-00002.safetensors",
117
- "model.decoder.layers.0.experts.local_experts.3.patch_embedding.linear_fc1.weight": "model-00001-of-00002.safetensors",
118
- "model.decoder.layers.0.experts.local_experts.3.patch_embedding.linear_fc2.weight": "model-00001-of-00002.safetensors",
119
- "model.decoder.layers.0.experts.local_experts.3.output_layer.weight": "model-00001-of-00002.safetensors",
120
- "model.decoder.layers.0.shared_experts.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
121
- "model.decoder.layers.0.shared_experts.layers.0.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
122
- "model.decoder.layers.0.shared_experts.layers.0.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
123
- "model.decoder.layers.0.shared_experts.layers.0.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
124
- "model.decoder.layers.0.shared_experts.layers.0.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
125
- "model.decoder.layers.0.shared_experts.layers.0.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
126
- "model.decoder.layers.0.shared_experts.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
127
- "model.decoder.layers.0.shared_experts.layers.1.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
128
- "model.decoder.layers.0.shared_experts.layers.1.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
129
- "model.decoder.layers.0.shared_experts.layers.1.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
130
- "model.decoder.layers.0.shared_experts.layers.1.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
131
- "model.decoder.layers.0.shared_experts.layers.1.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
132
- "model.decoder.layers.0.shared_experts.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
133
- "model.decoder.layers.0.shared_experts.layers.2.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
134
- "model.decoder.layers.0.shared_experts.layers.2.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
135
- "model.decoder.layers.0.shared_experts.layers.2.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
136
- "model.decoder.layers.0.shared_experts.layers.2.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
137
- "model.decoder.layers.0.shared_experts.layers.2.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
138
- "model.decoder.layers.0.shared_experts.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
139
- "model.decoder.layers.0.shared_experts.layers.3.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
140
- "model.decoder.layers.0.shared_experts.layers.3.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
141
- "model.decoder.layers.0.shared_experts.layers.3.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
142
- "model.decoder.layers.0.shared_experts.layers.3.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
143
- "model.decoder.layers.0.shared_experts.layers.3.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
144
- "model.decoder.layers.0.shared_experts.final_layernorm.weight": "model-00001-of-00002.safetensors",
145
- "model.decoder.layers.0.shared_experts.patch_embedding.linear_fc1.weight": "model-00001-of-00002.safetensors",
146
- "model.decoder.layers.0.shared_experts.patch_embedding.linear_fc2.weight": "model-00001-of-00002.safetensors",
147
- "model.decoder.layers.0.shared_experts.output_layer.weight": "model-00001-of-00002.safetensors",
148
- "model.decoder.layers.1.router.weight": "model-00001-of-00002.safetensors",
149
- "model.decoder.layers.1.backcast_layernorm.weight": "model-00001-of-00002.safetensors",
150
- "model.decoder.layers.1.experts.local_experts.0.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
151
- "model.decoder.layers.1.experts.local_experts.0.layers.0.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
152
- "model.decoder.layers.1.experts.local_experts.0.layers.0.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
153
- "model.decoder.layers.1.experts.local_experts.0.layers.0.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
154
- "model.decoder.layers.1.experts.local_experts.0.layers.0.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
155
- "model.decoder.layers.1.experts.local_experts.0.layers.0.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
156
- "model.decoder.layers.1.experts.local_experts.0.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
157
- "model.decoder.layers.1.experts.local_experts.0.layers.1.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
158
- "model.decoder.layers.1.experts.local_experts.0.layers.1.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
159
- "model.decoder.layers.1.experts.local_experts.0.layers.1.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
160
- "model.decoder.layers.1.experts.local_experts.0.layers.1.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
161
- "model.decoder.layers.1.experts.local_experts.0.layers.1.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
162
- "model.decoder.layers.1.experts.local_experts.0.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
163
- "model.decoder.layers.1.experts.local_experts.0.layers.2.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
164
- "model.decoder.layers.1.experts.local_experts.0.layers.2.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
165
- "model.decoder.layers.1.experts.local_experts.0.layers.2.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
166
- "model.decoder.layers.1.experts.local_experts.0.layers.2.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
167
- "model.decoder.layers.1.experts.local_experts.0.layers.2.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
168
- "model.decoder.layers.1.experts.local_experts.0.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
169
- "model.decoder.layers.1.experts.local_experts.0.layers.3.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
170
- "model.decoder.layers.1.experts.local_experts.0.layers.3.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
171
- "model.decoder.layers.1.experts.local_experts.0.layers.3.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
172
- "model.decoder.layers.1.experts.local_experts.0.layers.3.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
173
- "model.decoder.layers.1.experts.local_experts.0.layers.3.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
174
- "model.decoder.layers.1.experts.local_experts.0.final_layernorm.weight": "model-00001-of-00002.safetensors",
175
- "model.decoder.layers.1.experts.local_experts.0.patch_embedding.linear_fc1.weight": "model-00001-of-00002.safetensors",
176
- "model.decoder.layers.1.experts.local_experts.0.patch_embedding.linear_fc2.weight": "model-00001-of-00002.safetensors",
177
- "model.decoder.layers.1.experts.local_experts.0.output_layer.weight": "model-00001-of-00002.safetensors",
178
- "model.decoder.layers.1.experts.local_experts.1.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
179
- "model.decoder.layers.1.experts.local_experts.1.layers.0.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
180
- "model.decoder.layers.1.experts.local_experts.1.layers.0.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
181
- "model.decoder.layers.1.experts.local_experts.1.layers.0.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
182
- "model.decoder.layers.1.experts.local_experts.1.layers.0.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
183
- "model.decoder.layers.1.experts.local_experts.1.layers.0.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
184
- "model.decoder.layers.1.experts.local_experts.1.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
185
- "model.decoder.layers.1.experts.local_experts.1.layers.1.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
186
- "model.decoder.layers.1.experts.local_experts.1.layers.1.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
187
- "model.decoder.layers.1.experts.local_experts.1.layers.1.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
188
- "model.decoder.layers.1.experts.local_experts.1.layers.1.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
189
- "model.decoder.layers.1.experts.local_experts.1.layers.1.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
190
- "model.decoder.layers.1.experts.local_experts.1.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
191
- "model.decoder.layers.1.experts.local_experts.1.layers.2.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
192
- "model.decoder.layers.1.experts.local_experts.1.layers.2.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
193
- "model.decoder.layers.1.experts.local_experts.1.layers.2.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
194
- "model.decoder.layers.1.experts.local_experts.1.layers.2.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
195
- "model.decoder.layers.1.experts.local_experts.1.layers.2.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
196
- "model.decoder.layers.1.experts.local_experts.1.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
197
- "model.decoder.layers.1.experts.local_experts.1.layers.3.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
198
- "model.decoder.layers.1.experts.local_experts.1.layers.3.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
199
- "model.decoder.layers.1.experts.local_experts.1.layers.3.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
200
- "model.decoder.layers.1.experts.local_experts.1.layers.3.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
201
- "model.decoder.layers.1.experts.local_experts.1.layers.3.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
202
- "model.decoder.layers.1.experts.local_experts.1.final_layernorm.weight": "model-00001-of-00002.safetensors",
203
- "model.decoder.layers.1.experts.local_experts.1.patch_embedding.linear_fc1.weight": "model-00001-of-00002.safetensors",
204
- "model.decoder.layers.1.experts.local_experts.1.patch_embedding.linear_fc2.weight": "model-00001-of-00002.safetensors",
205
- "model.decoder.layers.1.experts.local_experts.1.output_layer.weight": "model-00001-of-00002.safetensors",
206
- "model.decoder.layers.1.experts.local_experts.2.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
207
- "model.decoder.layers.1.experts.local_experts.2.layers.0.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
208
- "model.decoder.layers.1.experts.local_experts.2.layers.0.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
209
- "model.decoder.layers.1.experts.local_experts.2.layers.0.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
210
- "model.decoder.layers.1.experts.local_experts.2.layers.0.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
211
- "model.decoder.layers.1.experts.local_experts.2.layers.0.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
212
- "model.decoder.layers.1.experts.local_experts.2.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
213
- "model.decoder.layers.1.experts.local_experts.2.layers.1.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
214
- "model.decoder.layers.1.experts.local_experts.2.layers.1.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
215
- "model.decoder.layers.1.experts.local_experts.2.layers.1.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
216
- "model.decoder.layers.1.experts.local_experts.2.layers.1.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
217
- "model.decoder.layers.1.experts.local_experts.2.layers.1.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
218
- "model.decoder.layers.1.experts.local_experts.2.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
219
- "model.decoder.layers.1.experts.local_experts.2.layers.2.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
220
- "model.decoder.layers.1.experts.local_experts.2.layers.2.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
221
- "model.decoder.layers.1.experts.local_experts.2.layers.2.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
222
- "model.decoder.layers.1.experts.local_experts.2.layers.2.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
223
- "model.decoder.layers.1.experts.local_experts.2.layers.2.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
224
- "model.decoder.layers.1.experts.local_experts.2.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
225
- "model.decoder.layers.1.experts.local_experts.2.layers.3.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
226
- "model.decoder.layers.1.experts.local_experts.2.layers.3.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
227
- "model.decoder.layers.1.experts.local_experts.2.layers.3.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
228
- "model.decoder.layers.1.experts.local_experts.2.layers.3.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
229
- "model.decoder.layers.1.experts.local_experts.2.layers.3.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
230
- "model.decoder.layers.1.experts.local_experts.2.final_layernorm.weight": "model-00001-of-00002.safetensors",
231
- "model.decoder.layers.1.experts.local_experts.2.patch_embedding.linear_fc1.weight": "model-00001-of-00002.safetensors",
232
- "model.decoder.layers.1.experts.local_experts.2.patch_embedding.linear_fc2.weight": "model-00001-of-00002.safetensors",
233
- "model.decoder.layers.1.experts.local_experts.2.output_layer.weight": "model-00001-of-00002.safetensors",
234
- "model.decoder.layers.1.experts.local_experts.3.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
235
- "model.decoder.layers.1.experts.local_experts.3.layers.0.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
236
- "model.decoder.layers.1.experts.local_experts.3.layers.0.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
237
- "model.decoder.layers.1.experts.local_experts.3.layers.0.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
238
- "model.decoder.layers.1.experts.local_experts.3.layers.0.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
239
- "model.decoder.layers.1.experts.local_experts.3.layers.0.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
240
- "model.decoder.layers.1.experts.local_experts.3.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
241
- "model.decoder.layers.1.experts.local_experts.3.layers.1.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
242
- "model.decoder.layers.1.experts.local_experts.3.layers.1.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
243
- "model.decoder.layers.1.experts.local_experts.3.layers.1.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
244
- "model.decoder.layers.1.experts.local_experts.3.layers.1.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
245
- "model.decoder.layers.1.experts.local_experts.3.layers.1.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
246
- "model.decoder.layers.1.experts.local_experts.3.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
247
- "model.decoder.layers.1.experts.local_experts.3.layers.2.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
248
- "model.decoder.layers.1.experts.local_experts.3.layers.2.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
249
- "model.decoder.layers.1.experts.local_experts.3.layers.2.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
250
- "model.decoder.layers.1.experts.local_experts.3.layers.2.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
251
- "model.decoder.layers.1.experts.local_experts.3.layers.2.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
252
- "model.decoder.layers.1.experts.local_experts.3.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
253
- "model.decoder.layers.1.experts.local_experts.3.layers.3.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
254
- "model.decoder.layers.1.experts.local_experts.3.layers.3.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
255
- "model.decoder.layers.1.experts.local_experts.3.layers.3.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
256
- "model.decoder.layers.1.experts.local_experts.3.layers.3.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
257
- "model.decoder.layers.1.experts.local_experts.3.layers.3.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
258
- "model.decoder.layers.1.experts.local_experts.3.final_layernorm.weight": "model-00001-of-00002.safetensors",
259
- "model.decoder.layers.1.experts.local_experts.3.patch_embedding.linear_fc1.weight": "model-00001-of-00002.safetensors",
260
- "model.decoder.layers.1.experts.local_experts.3.patch_embedding.linear_fc2.weight": "model-00001-of-00002.safetensors",
261
- "model.decoder.layers.1.experts.local_experts.3.output_layer.weight": "model-00002-of-00002.safetensors",
262
- "model.decoder.layers.1.shared_experts.layers.0.input_layernorm.weight": "model-00002-of-00002.safetensors",
263
- "model.decoder.layers.1.shared_experts.layers.0.self_attention.linear_proj.weight": "model-00002-of-00002.safetensors",
264
- "model.decoder.layers.1.shared_experts.layers.0.self_attention.linear_qkv.weight": "model-00002-of-00002.safetensors",
265
- "model.decoder.layers.1.shared_experts.layers.0.pre_mlp_layernorm.weight": "model-00002-of-00002.safetensors",
266
- "model.decoder.layers.1.shared_experts.layers.0.mlp.linear_fc1.weight": "model-00002-of-00002.safetensors",
267
- "model.decoder.layers.1.shared_experts.layers.0.mlp.linear_fc2.weight": "model-00002-of-00002.safetensors",
268
- "model.decoder.layers.1.shared_experts.layers.1.input_layernorm.weight": "model-00002-of-00002.safetensors",
269
- "model.decoder.layers.1.shared_experts.layers.1.self_attention.linear_proj.weight": "model-00002-of-00002.safetensors",
270
- "model.decoder.layers.1.shared_experts.layers.1.self_attention.linear_qkv.weight": "model-00002-of-00002.safetensors",
271
- "model.decoder.layers.1.shared_experts.layers.1.pre_mlp_layernorm.weight": "model-00002-of-00002.safetensors",
272
- "model.decoder.layers.1.shared_experts.layers.1.mlp.linear_fc1.weight": "model-00002-of-00002.safetensors",
273
- "model.decoder.layers.1.shared_experts.layers.1.mlp.linear_fc2.weight": "model-00002-of-00002.safetensors",
274
- "model.decoder.layers.1.shared_experts.layers.2.input_layernorm.weight": "model-00002-of-00002.safetensors",
275
- "model.decoder.layers.1.shared_experts.layers.2.self_attention.linear_proj.weight": "model-00002-of-00002.safetensors",
276
- "model.decoder.layers.1.shared_experts.layers.2.self_attention.linear_qkv.weight": "model-00002-of-00002.safetensors",
277
- "model.decoder.layers.1.shared_experts.layers.2.pre_mlp_layernorm.weight": "model-00002-of-00002.safetensors",
278
- "model.decoder.layers.1.shared_experts.layers.2.mlp.linear_fc1.weight": "model-00002-of-00002.safetensors",
279
- "model.decoder.layers.1.shared_experts.layers.2.mlp.linear_fc2.weight": "model-00002-of-00002.safetensors",
280
- "model.decoder.layers.1.shared_experts.layers.3.input_layernorm.weight": "model-00002-of-00002.safetensors",
281
- "model.decoder.layers.1.shared_experts.layers.3.self_attention.linear_proj.weight": "model-00002-of-00002.safetensors",
282
- "model.decoder.layers.1.shared_experts.layers.3.self_attention.linear_qkv.weight": "model-00002-of-00002.safetensors",
283
- "model.decoder.layers.1.shared_experts.layers.3.pre_mlp_layernorm.weight": "model-00002-of-00002.safetensors",
284
- "model.decoder.layers.1.shared_experts.layers.3.mlp.linear_fc1.weight": "model-00002-of-00002.safetensors",
285
- "model.decoder.layers.1.shared_experts.layers.3.mlp.linear_fc2.weight": "model-00002-of-00002.safetensors",
286
- "model.decoder.layers.1.shared_experts.final_layernorm.weight": "model-00002-of-00002.safetensors",
287
- "model.decoder.layers.1.shared_experts.patch_embedding.linear_fc1.weight": "model-00002-of-00002.safetensors",
288
- "model.decoder.layers.1.shared_experts.patch_embedding.linear_fc2.weight": "model-00002-of-00002.safetensors",
289
- "model.decoder.layers.1.shared_experts.output_layer.weight": "model-00002-of-00002.safetensors",
290
- "model.output_layer.weight": "model-00002-of-00002.safetensors"
291
- }
 
292
  }
 
1
  {
2
+ "metadata": {
3
+ "total_size": 4983115648
4
+ },
5
+ "weight_map": {
6
+ "model.decoder.layers.0.router.weight": "model-00001-of-00002.safetensors",
7
+ "model.decoder.layers.0.backcast_layernorm.weight": "model-00001-of-00002.safetensors",
8
+ "model.decoder.layers.0.experts.local_experts.0.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
9
+ "model.decoder.layers.0.experts.local_experts.0.layers.0.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
10
+ "model.decoder.layers.0.experts.local_experts.0.layers.0.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
11
+ "model.decoder.layers.0.experts.local_experts.0.layers.0.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
12
+ "model.decoder.layers.0.experts.local_experts.0.layers.0.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
13
+ "model.decoder.layers.0.experts.local_experts.0.layers.0.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
14
+ "model.decoder.layers.0.experts.local_experts.0.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
15
+ "model.decoder.layers.0.experts.local_experts.0.layers.1.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
16
+ "model.decoder.layers.0.experts.local_experts.0.layers.1.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
17
+ "model.decoder.layers.0.experts.local_experts.0.layers.1.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
18
+ "model.decoder.layers.0.experts.local_experts.0.layers.1.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
19
+ "model.decoder.layers.0.experts.local_experts.0.layers.1.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
20
+ "model.decoder.layers.0.experts.local_experts.0.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
21
+ "model.decoder.layers.0.experts.local_experts.0.layers.2.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
22
+ "model.decoder.layers.0.experts.local_experts.0.layers.2.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
23
+ "model.decoder.layers.0.experts.local_experts.0.layers.2.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
24
+ "model.decoder.layers.0.experts.local_experts.0.layers.2.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
25
+ "model.decoder.layers.0.experts.local_experts.0.layers.2.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
26
+ "model.decoder.layers.0.experts.local_experts.0.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
27
+ "model.decoder.layers.0.experts.local_experts.0.layers.3.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
28
+ "model.decoder.layers.0.experts.local_experts.0.layers.3.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
29
+ "model.decoder.layers.0.experts.local_experts.0.layers.3.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
30
+ "model.decoder.layers.0.experts.local_experts.0.layers.3.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
31
+ "model.decoder.layers.0.experts.local_experts.0.layers.3.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
32
+ "model.decoder.layers.0.experts.local_experts.0.final_layernorm.weight": "model-00001-of-00002.safetensors",
33
+ "model.decoder.layers.0.experts.local_experts.0.patch_embedding.linear_fc1.weight": "model-00001-of-00002.safetensors",
34
+ "model.decoder.layers.0.experts.local_experts.0.patch_embedding.linear_fc2.weight": "model-00001-of-00002.safetensors",
35
+ "model.decoder.layers.0.experts.local_experts.0.output_layer.weight": "model-00001-of-00002.safetensors",
36
+ "model.decoder.layers.0.experts.local_experts.1.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
37
+ "model.decoder.layers.0.experts.local_experts.1.layers.0.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
38
+ "model.decoder.layers.0.experts.local_experts.1.layers.0.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
39
+ "model.decoder.layers.0.experts.local_experts.1.layers.0.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
40
+ "model.decoder.layers.0.experts.local_experts.1.layers.0.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
41
+ "model.decoder.layers.0.experts.local_experts.1.layers.0.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
42
+ "model.decoder.layers.0.experts.local_experts.1.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
43
+ "model.decoder.layers.0.experts.local_experts.1.layers.1.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
44
+ "model.decoder.layers.0.experts.local_experts.1.layers.1.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
45
+ "model.decoder.layers.0.experts.local_experts.1.layers.1.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
46
+ "model.decoder.layers.0.experts.local_experts.1.layers.1.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
47
+ "model.decoder.layers.0.experts.local_experts.1.layers.1.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
48
+ "model.decoder.layers.0.experts.local_experts.1.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
49
+ "model.decoder.layers.0.experts.local_experts.1.layers.2.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
50
+ "model.decoder.layers.0.experts.local_experts.1.layers.2.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
51
+ "model.decoder.layers.0.experts.local_experts.1.layers.2.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
52
+ "model.decoder.layers.0.experts.local_experts.1.layers.2.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
53
+ "model.decoder.layers.0.experts.local_experts.1.layers.2.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
54
+ "model.decoder.layers.0.experts.local_experts.1.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
55
+ "model.decoder.layers.0.experts.local_experts.1.layers.3.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
56
+ "model.decoder.layers.0.experts.local_experts.1.layers.3.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
57
+ "model.decoder.layers.0.experts.local_experts.1.layers.3.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
58
+ "model.decoder.layers.0.experts.local_experts.1.layers.3.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
59
+ "model.decoder.layers.0.experts.local_experts.1.layers.3.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
60
+ "model.decoder.layers.0.experts.local_experts.1.final_layernorm.weight": "model-00001-of-00002.safetensors",
61
+ "model.decoder.layers.0.experts.local_experts.1.patch_embedding.linear_fc1.weight": "model-00001-of-00002.safetensors",
62
+ "model.decoder.layers.0.experts.local_experts.1.patch_embedding.linear_fc2.weight": "model-00001-of-00002.safetensors",
63
+ "model.decoder.layers.0.experts.local_experts.1.output_layer.weight": "model-00001-of-00002.safetensors",
64
+ "model.decoder.layers.0.experts.local_experts.2.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
65
+ "model.decoder.layers.0.experts.local_experts.2.layers.0.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
66
+ "model.decoder.layers.0.experts.local_experts.2.layers.0.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
67
+ "model.decoder.layers.0.experts.local_experts.2.layers.0.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
68
+ "model.decoder.layers.0.experts.local_experts.2.layers.0.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
69
+ "model.decoder.layers.0.experts.local_experts.2.layers.0.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
70
+ "model.decoder.layers.0.experts.local_experts.2.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
71
+ "model.decoder.layers.0.experts.local_experts.2.layers.1.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
72
+ "model.decoder.layers.0.experts.local_experts.2.layers.1.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
73
+ "model.decoder.layers.0.experts.local_experts.2.layers.1.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
74
+ "model.decoder.layers.0.experts.local_experts.2.layers.1.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
75
+ "model.decoder.layers.0.experts.local_experts.2.layers.1.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
76
+ "model.decoder.layers.0.experts.local_experts.2.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
77
+ "model.decoder.layers.0.experts.local_experts.2.layers.2.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
78
+ "model.decoder.layers.0.experts.local_experts.2.layers.2.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
79
+ "model.decoder.layers.0.experts.local_experts.2.layers.2.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
80
+ "model.decoder.layers.0.experts.local_experts.2.layers.2.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
81
+ "model.decoder.layers.0.experts.local_experts.2.layers.2.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
82
+ "model.decoder.layers.0.experts.local_experts.2.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
83
+ "model.decoder.layers.0.experts.local_experts.2.layers.3.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
84
+ "model.decoder.layers.0.experts.local_experts.2.layers.3.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
85
+ "model.decoder.layers.0.experts.local_experts.2.layers.3.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
86
+ "model.decoder.layers.0.experts.local_experts.2.layers.3.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
87
+ "model.decoder.layers.0.experts.local_experts.2.layers.3.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
88
+ "model.decoder.layers.0.experts.local_experts.2.final_layernorm.weight": "model-00001-of-00002.safetensors",
89
+ "model.decoder.layers.0.experts.local_experts.2.patch_embedding.linear_fc1.weight": "model-00001-of-00002.safetensors",
90
+ "model.decoder.layers.0.experts.local_experts.2.patch_embedding.linear_fc2.weight": "model-00001-of-00002.safetensors",
91
+ "model.decoder.layers.0.experts.local_experts.2.output_layer.weight": "model-00001-of-00002.safetensors",
92
+ "model.decoder.layers.0.experts.local_experts.3.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
93
+ "model.decoder.layers.0.experts.local_experts.3.layers.0.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
94
+ "model.decoder.layers.0.experts.local_experts.3.layers.0.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
95
+ "model.decoder.layers.0.experts.local_experts.3.layers.0.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
96
+ "model.decoder.layers.0.experts.local_experts.3.layers.0.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
97
+ "model.decoder.layers.0.experts.local_experts.3.layers.0.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
98
+ "model.decoder.layers.0.experts.local_experts.3.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
99
+ "model.decoder.layers.0.experts.local_experts.3.layers.1.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
100
+ "model.decoder.layers.0.experts.local_experts.3.layers.1.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
101
+ "model.decoder.layers.0.experts.local_experts.3.layers.1.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
102
+ "model.decoder.layers.0.experts.local_experts.3.layers.1.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
103
+ "model.decoder.layers.0.experts.local_experts.3.layers.1.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
104
+ "model.decoder.layers.0.experts.local_experts.3.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
105
+ "model.decoder.layers.0.experts.local_experts.3.layers.2.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
106
+ "model.decoder.layers.0.experts.local_experts.3.layers.2.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
107
+ "model.decoder.layers.0.experts.local_experts.3.layers.2.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
108
+ "model.decoder.layers.0.experts.local_experts.3.layers.2.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
109
+ "model.decoder.layers.0.experts.local_experts.3.layers.2.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
110
+ "model.decoder.layers.0.experts.local_experts.3.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
111
+ "model.decoder.layers.0.experts.local_experts.3.layers.3.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
112
+ "model.decoder.layers.0.experts.local_experts.3.layers.3.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
113
+ "model.decoder.layers.0.experts.local_experts.3.layers.3.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
114
+ "model.decoder.layers.0.experts.local_experts.3.layers.3.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
115
+ "model.decoder.layers.0.experts.local_experts.3.layers.3.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
116
+ "model.decoder.layers.0.experts.local_experts.3.final_layernorm.weight": "model-00001-of-00002.safetensors",
117
+ "model.decoder.layers.0.experts.local_experts.3.patch_embedding.linear_fc1.weight": "model-00001-of-00002.safetensors",
118
+ "model.decoder.layers.0.experts.local_experts.3.patch_embedding.linear_fc2.weight": "model-00001-of-00002.safetensors",
119
+ "model.decoder.layers.0.experts.local_experts.3.output_layer.weight": "model-00001-of-00002.safetensors",
120
+ "model.decoder.layers.0.shared_experts.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
121
+ "model.decoder.layers.0.shared_experts.layers.0.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
122
+ "model.decoder.layers.0.shared_experts.layers.0.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
123
+ "model.decoder.layers.0.shared_experts.layers.0.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
124
+ "model.decoder.layers.0.shared_experts.layers.0.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
125
+ "model.decoder.layers.0.shared_experts.layers.0.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
126
+ "model.decoder.layers.0.shared_experts.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
127
+ "model.decoder.layers.0.shared_experts.layers.1.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
128
+ "model.decoder.layers.0.shared_experts.layers.1.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
129
+ "model.decoder.layers.0.shared_experts.layers.1.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
130
+ "model.decoder.layers.0.shared_experts.layers.1.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
131
+ "model.decoder.layers.0.shared_experts.layers.1.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
132
+ "model.decoder.layers.0.shared_experts.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
133
+ "model.decoder.layers.0.shared_experts.layers.2.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
134
+ "model.decoder.layers.0.shared_experts.layers.2.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
135
+ "model.decoder.layers.0.shared_experts.layers.2.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
136
+ "model.decoder.layers.0.shared_experts.layers.2.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
137
+ "model.decoder.layers.0.shared_experts.layers.2.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
138
+ "model.decoder.layers.0.shared_experts.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
139
+ "model.decoder.layers.0.shared_experts.layers.3.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
140
+ "model.decoder.layers.0.shared_experts.layers.3.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
141
+ "model.decoder.layers.0.shared_experts.layers.3.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
142
+ "model.decoder.layers.0.shared_experts.layers.3.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
143
+ "model.decoder.layers.0.shared_experts.layers.3.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
144
+ "model.decoder.layers.0.shared_experts.final_layernorm.weight": "model-00001-of-00002.safetensors",
145
+ "model.decoder.layers.0.shared_experts.patch_embedding.linear_fc1.weight": "model-00001-of-00002.safetensors",
146
+ "model.decoder.layers.0.shared_experts.patch_embedding.linear_fc2.weight": "model-00001-of-00002.safetensors",
147
+ "model.decoder.layers.0.shared_experts.output_layer.weight": "model-00001-of-00002.safetensors",
148
+ "model.decoder.layers.1.router.weight": "model-00001-of-00002.safetensors",
149
+ "model.decoder.layers.1.backcast_layernorm.weight": "model-00001-of-00002.safetensors",
150
+ "model.decoder.layers.1.experts.local_experts.0.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
151
+ "model.decoder.layers.1.experts.local_experts.0.layers.0.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
152
+ "model.decoder.layers.1.experts.local_experts.0.layers.0.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
153
+ "model.decoder.layers.1.experts.local_experts.0.layers.0.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
154
+ "model.decoder.layers.1.experts.local_experts.0.layers.0.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
155
+ "model.decoder.layers.1.experts.local_experts.0.layers.0.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
156
+ "model.decoder.layers.1.experts.local_experts.0.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
157
+ "model.decoder.layers.1.experts.local_experts.0.layers.1.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
158
+ "model.decoder.layers.1.experts.local_experts.0.layers.1.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
159
+ "model.decoder.layers.1.experts.local_experts.0.layers.1.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
160
+ "model.decoder.layers.1.experts.local_experts.0.layers.1.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
161
+ "model.decoder.layers.1.experts.local_experts.0.layers.1.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
162
+ "model.decoder.layers.1.experts.local_experts.0.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
163
+ "model.decoder.layers.1.experts.local_experts.0.layers.2.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
164
+ "model.decoder.layers.1.experts.local_experts.0.layers.2.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
165
+ "model.decoder.layers.1.experts.local_experts.0.layers.2.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
166
+ "model.decoder.layers.1.experts.local_experts.0.layers.2.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
167
+ "model.decoder.layers.1.experts.local_experts.0.layers.2.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
168
+ "model.decoder.layers.1.experts.local_experts.0.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
169
+ "model.decoder.layers.1.experts.local_experts.0.layers.3.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
170
+ "model.decoder.layers.1.experts.local_experts.0.layers.3.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
171
+ "model.decoder.layers.1.experts.local_experts.0.layers.3.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
172
+ "model.decoder.layers.1.experts.local_experts.0.layers.3.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
173
+ "model.decoder.layers.1.experts.local_experts.0.layers.3.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
174
+ "model.decoder.layers.1.experts.local_experts.0.final_layernorm.weight": "model-00001-of-00002.safetensors",
175
+ "model.decoder.layers.1.experts.local_experts.0.patch_embedding.linear_fc1.weight": "model-00001-of-00002.safetensors",
176
+ "model.decoder.layers.1.experts.local_experts.0.patch_embedding.linear_fc2.weight": "model-00001-of-00002.safetensors",
177
+ "model.decoder.layers.1.experts.local_experts.0.output_layer.weight": "model-00001-of-00002.safetensors",
178
+ "model.decoder.layers.1.experts.local_experts.1.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
179
+ "model.decoder.layers.1.experts.local_experts.1.layers.0.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
180
+ "model.decoder.layers.1.experts.local_experts.1.layers.0.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
181
+ "model.decoder.layers.1.experts.local_experts.1.layers.0.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
182
+ "model.decoder.layers.1.experts.local_experts.1.layers.0.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
183
+ "model.decoder.layers.1.experts.local_experts.1.layers.0.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
184
+ "model.decoder.layers.1.experts.local_experts.1.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
185
+ "model.decoder.layers.1.experts.local_experts.1.layers.1.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
186
+ "model.decoder.layers.1.experts.local_experts.1.layers.1.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
187
+ "model.decoder.layers.1.experts.local_experts.1.layers.1.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
188
+ "model.decoder.layers.1.experts.local_experts.1.layers.1.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
189
+ "model.decoder.layers.1.experts.local_experts.1.layers.1.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
190
+ "model.decoder.layers.1.experts.local_experts.1.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
191
+ "model.decoder.layers.1.experts.local_experts.1.layers.2.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
192
+ "model.decoder.layers.1.experts.local_experts.1.layers.2.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
193
+ "model.decoder.layers.1.experts.local_experts.1.layers.2.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
194
+ "model.decoder.layers.1.experts.local_experts.1.layers.2.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
195
+ "model.decoder.layers.1.experts.local_experts.1.layers.2.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
196
+ "model.decoder.layers.1.experts.local_experts.1.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
197
+ "model.decoder.layers.1.experts.local_experts.1.layers.3.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
198
+ "model.decoder.layers.1.experts.local_experts.1.layers.3.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
199
+ "model.decoder.layers.1.experts.local_experts.1.layers.3.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
200
+ "model.decoder.layers.1.experts.local_experts.1.layers.3.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
201
+ "model.decoder.layers.1.experts.local_experts.1.layers.3.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
202
+ "model.decoder.layers.1.experts.local_experts.1.final_layernorm.weight": "model-00001-of-00002.safetensors",
203
+ "model.decoder.layers.1.experts.local_experts.1.patch_embedding.linear_fc1.weight": "model-00001-of-00002.safetensors",
204
+ "model.decoder.layers.1.experts.local_experts.1.patch_embedding.linear_fc2.weight": "model-00001-of-00002.safetensors",
205
+ "model.decoder.layers.1.experts.local_experts.1.output_layer.weight": "model-00001-of-00002.safetensors",
206
+ "model.decoder.layers.1.experts.local_experts.2.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
207
+ "model.decoder.layers.1.experts.local_experts.2.layers.0.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
208
+ "model.decoder.layers.1.experts.local_experts.2.layers.0.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
209
+ "model.decoder.layers.1.experts.local_experts.2.layers.0.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
210
+ "model.decoder.layers.1.experts.local_experts.2.layers.0.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
211
+ "model.decoder.layers.1.experts.local_experts.2.layers.0.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
212
+ "model.decoder.layers.1.experts.local_experts.2.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
213
+ "model.decoder.layers.1.experts.local_experts.2.layers.1.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
214
+ "model.decoder.layers.1.experts.local_experts.2.layers.1.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
215
+ "model.decoder.layers.1.experts.local_experts.2.layers.1.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
216
+ "model.decoder.layers.1.experts.local_experts.2.layers.1.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
217
+ "model.decoder.layers.1.experts.local_experts.2.layers.1.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
218
+ "model.decoder.layers.1.experts.local_experts.2.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
219
+ "model.decoder.layers.1.experts.local_experts.2.layers.2.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
220
+ "model.decoder.layers.1.experts.local_experts.2.layers.2.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
221
+ "model.decoder.layers.1.experts.local_experts.2.layers.2.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
222
+ "model.decoder.layers.1.experts.local_experts.2.layers.2.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
223
+ "model.decoder.layers.1.experts.local_experts.2.layers.2.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
224
+ "model.decoder.layers.1.experts.local_experts.2.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
225
+ "model.decoder.layers.1.experts.local_experts.2.layers.3.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
226
+ "model.decoder.layers.1.experts.local_experts.2.layers.3.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
227
+ "model.decoder.layers.1.experts.local_experts.2.layers.3.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
228
+ "model.decoder.layers.1.experts.local_experts.2.layers.3.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
229
+ "model.decoder.layers.1.experts.local_experts.2.layers.3.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
230
+ "model.decoder.layers.1.experts.local_experts.2.final_layernorm.weight": "model-00001-of-00002.safetensors",
231
+ "model.decoder.layers.1.experts.local_experts.2.patch_embedding.linear_fc1.weight": "model-00001-of-00002.safetensors",
232
+ "model.decoder.layers.1.experts.local_experts.2.patch_embedding.linear_fc2.weight": "model-00001-of-00002.safetensors",
233
+ "model.decoder.layers.1.experts.local_experts.2.output_layer.weight": "model-00001-of-00002.safetensors",
234
+ "model.decoder.layers.1.experts.local_experts.3.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
235
+ "model.decoder.layers.1.experts.local_experts.3.layers.0.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
236
+ "model.decoder.layers.1.experts.local_experts.3.layers.0.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
237
+ "model.decoder.layers.1.experts.local_experts.3.layers.0.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
238
+ "model.decoder.layers.1.experts.local_experts.3.layers.0.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
239
+ "model.decoder.layers.1.experts.local_experts.3.layers.0.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
240
+ "model.decoder.layers.1.experts.local_experts.3.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
241
+ "model.decoder.layers.1.experts.local_experts.3.layers.1.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
242
+ "model.decoder.layers.1.experts.local_experts.3.layers.1.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
243
+ "model.decoder.layers.1.experts.local_experts.3.layers.1.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
244
+ "model.decoder.layers.1.experts.local_experts.3.layers.1.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
245
+ "model.decoder.layers.1.experts.local_experts.3.layers.1.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
246
+ "model.decoder.layers.1.experts.local_experts.3.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
247
+ "model.decoder.layers.1.experts.local_experts.3.layers.2.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
248
+ "model.decoder.layers.1.experts.local_experts.3.layers.2.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
249
+ "model.decoder.layers.1.experts.local_experts.3.layers.2.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
250
+ "model.decoder.layers.1.experts.local_experts.3.layers.2.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
251
+ "model.decoder.layers.1.experts.local_experts.3.layers.2.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
252
+ "model.decoder.layers.1.experts.local_experts.3.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
253
+ "model.decoder.layers.1.experts.local_experts.3.layers.3.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
254
+ "model.decoder.layers.1.experts.local_experts.3.layers.3.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
255
+ "model.decoder.layers.1.experts.local_experts.3.layers.3.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
256
+ "model.decoder.layers.1.experts.local_experts.3.layers.3.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
257
+ "model.decoder.layers.1.experts.local_experts.3.layers.3.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
258
+ "model.decoder.layers.1.experts.local_experts.3.final_layernorm.weight": "model-00001-of-00002.safetensors",
259
+ "model.decoder.layers.1.experts.local_experts.3.patch_embedding.linear_fc1.weight": "model-00001-of-00002.safetensors",
260
+ "model.decoder.layers.1.experts.local_experts.3.patch_embedding.linear_fc2.weight": "model-00001-of-00002.safetensors",
261
+ "model.decoder.layers.1.experts.local_experts.3.output_layer.weight": "model-00002-of-00002.safetensors",
262
+ "model.decoder.layers.1.shared_experts.layers.0.input_layernorm.weight": "model-00002-of-00002.safetensors",
263
+ "model.decoder.layers.1.shared_experts.layers.0.self_attention.linear_proj.weight": "model-00002-of-00002.safetensors",
264
+ "model.decoder.layers.1.shared_experts.layers.0.self_attention.linear_qkv.weight": "model-00002-of-00002.safetensors",
265
+ "model.decoder.layers.1.shared_experts.layers.0.pre_mlp_layernorm.weight": "model-00002-of-00002.safetensors",
266
+ "model.decoder.layers.1.shared_experts.layers.0.mlp.linear_fc1.weight": "model-00002-of-00002.safetensors",
267
+ "model.decoder.layers.1.shared_experts.layers.0.mlp.linear_fc2.weight": "model-00002-of-00002.safetensors",
268
+ "model.decoder.layers.1.shared_experts.layers.1.input_layernorm.weight": "model-00002-of-00002.safetensors",
269
+ "model.decoder.layers.1.shared_experts.layers.1.self_attention.linear_proj.weight": "model-00002-of-00002.safetensors",
270
+ "model.decoder.layers.1.shared_experts.layers.1.self_attention.linear_qkv.weight": "model-00002-of-00002.safetensors",
271
+ "model.decoder.layers.1.shared_experts.layers.1.pre_mlp_layernorm.weight": "model-00002-of-00002.safetensors",
272
+ "model.decoder.layers.1.shared_experts.layers.1.mlp.linear_fc1.weight": "model-00002-of-00002.safetensors",
273
+ "model.decoder.layers.1.shared_experts.layers.1.mlp.linear_fc2.weight": "model-00002-of-00002.safetensors",
274
+ "model.decoder.layers.1.shared_experts.layers.2.input_layernorm.weight": "model-00002-of-00002.safetensors",
275
+ "model.decoder.layers.1.shared_experts.layers.2.self_attention.linear_proj.weight": "model-00002-of-00002.safetensors",
276
+ "model.decoder.layers.1.shared_experts.layers.2.self_attention.linear_qkv.weight": "model-00002-of-00002.safetensors",
277
+ "model.decoder.layers.1.shared_experts.layers.2.pre_mlp_layernorm.weight": "model-00002-of-00002.safetensors",
278
+ "model.decoder.layers.1.shared_experts.layers.2.mlp.linear_fc1.weight": "model-00002-of-00002.safetensors",
279
+ "model.decoder.layers.1.shared_experts.layers.2.mlp.linear_fc2.weight": "model-00002-of-00002.safetensors",
280
+ "model.decoder.layers.1.shared_experts.layers.3.input_layernorm.weight": "model-00002-of-00002.safetensors",
281
+ "model.decoder.layers.1.shared_experts.layers.3.self_attention.linear_proj.weight": "model-00002-of-00002.safetensors",
282
+ "model.decoder.layers.1.shared_experts.layers.3.self_attention.linear_qkv.weight": "model-00002-of-00002.safetensors",
283
+ "model.decoder.layers.1.shared_experts.layers.3.pre_mlp_layernorm.weight": "model-00002-of-00002.safetensors",
284
+ "model.decoder.layers.1.shared_experts.layers.3.mlp.linear_fc1.weight": "model-00002-of-00002.safetensors",
285
+ "model.decoder.layers.1.shared_experts.layers.3.mlp.linear_fc2.weight": "model-00002-of-00002.safetensors",
286
+ "model.decoder.layers.1.shared_experts.final_layernorm.weight": "model-00002-of-00002.safetensors",
287
+ "model.decoder.layers.1.shared_experts.patch_embedding.linear_fc1.weight": "model-00002-of-00002.safetensors",
288
+ "model.decoder.layers.1.shared_experts.patch_embedding.linear_fc2.weight": "model-00002-of-00002.safetensors",
289
+ "model.decoder.layers.1.shared_experts.output_layer.weight": "model-00002-of-00002.safetensors",
290
+ "model.decoder.input_layernorm.weight": "model-00002-of-00002.safetensors",
291
+ "model.output_layer.weight": "model-00002-of-00002.safetensors"
292
+ }
293
  }
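
The weight_map above is the standard safetensors index: every parameter name points at the shard file that stores it. As a minimal sketch (file names and the chosen parameter are taken from the map above; paths assume the index and shards have been downloaded into the working directory), resolving one tensor from such an index looks like this:

import json
from safetensors import safe_open

with open("model.safetensors.index.json") as f:
    index = json.load(f)

name = "model.decoder.layers.0.shared_experts.output_layer.weight"
shard_file = index["weight_map"][name]      # e.g. "model-00001-of-00002.safetensors"

with safe_open(shard_file, framework="pt") as reader:
    tensor = reader.get_tensor(name)        # the single parameter, as a torch.Tensor
print(name, tuple(tensor.shape), "->", shard_file)
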
modeling_FalconTST.py CHANGED
@@ -10,7 +10,6 @@ from einops import rearrange, repeat
10
  from functools import reduce
11
  from abc import ABC, abstractmethod
12
  from .configuration_FalconTST import FalconTSTConfig
13
- from .ts_generation_mixin import FalconTSTGenerationMixin
14
  from transformers import PreTrainedModel, Cache, DynamicCache
15
  from transformers.activations import ACT2FN
16
  from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
@@ -74,63 +73,6 @@ def _apply_rotary_pos_emb_bshd(
74
  return torch.cat((t, t_pass), dim=-1)
75
 
76
 
77
- def topk_softmax_with_capacity(
78
- logits: torch.Tensor,
79
- topk: int,
80
- use_pre_softmax: bool = False,
81
- score_function: str = "softmax",
82
- expert_bias: Optional[torch.Tensor] = None,
83
- ):
84
- """Apply capacity and padding to the top-k selection.
85
- Args:
86
- logits (torch.Tensor): Logits tensor.
87
- topk (int): The number of experts to select for each token.
88
- use_pre_softmax (bool): Whether to apply softmax or sigmoid before top-k selection.
89
- score_function (str): The score function to use. Can be either "softmax" or "sigmoid".
90
- expert_bias (torch.Tensor): The bias added to logits for expert routing.
91
- Returns:
92
- Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
93
- - routing_probs (torch.Tensor): A tensor of shape [num_tokens, num_experts] containing
94
- the routing probabilities for each token to each expert.
95
- - routing_map (torch.Tensor): A mask tensor of shape [num_tokens, num_experts]
96
- indicating which experts were selected for each token. True values represent
97
- the selected experts.
98
- - tokens_per_expert (torch.Tensor): A tensor of shape [num_experts] containing
99
- the number of local tokens assigned to each expert before dropping and padding.
100
- """
101
- assert logits.dim() == 2, f"Expected 2D logits [num_tokens, num_experts], got {logits.dim()}."
102
-
103
- def compute_topk(scores, topk,):
104
- return torch.topk(scores, k=topk, dim=1)
105
-
106
- if score_function == "softmax":
107
- if use_pre_softmax:
108
- scores = torch.softmax(logits, dim=-1, dtype=torch.float32).type_as(logits)
109
- probs, top_indices = compute_topk(scores, topk, )
110
- else:
111
- scores, top_indices = compute_topk(logits, topk, )
112
- probs = torch.softmax(scores, dim=-1, dtype=torch.float32).type_as(logits)
113
- elif score_function == "sigmoid":
114
- scores = torch.sigmoid(logits.float()).type_as(logits)
115
- if expert_bias is not None:
116
- scores_for_routing = scores + expert_bias
117
- _, top_indices = compute_topk(scores_for_routing, topk, )
118
- scores = torch.gather(scores, dim=1, index=top_indices).type_as(logits)
119
- else:
120
- scores, top_indices = compute_topk(scores, topk,)
121
- probs = scores / (scores.sum(dim=-1, keepdim=True) + 1e-20) if topk > 1 else scores
122
- else:
123
- raise ValueError(f"Invalid score_function: {score_function}")
124
-
125
- # TODO Try using element-wise operations instead of scatter?
126
- topk_masked_gates = torch.zeros_like(logits).scatter(1, top_indices, probs)
127
- topk_map = torch.zeros_like(logits).int().scatter(1, top_indices, 1).bool()
128
- # TODO: Reset topk_map to realize load-balancing?
129
- tokens_per_expert = topk_map.sum(dim=0)
130
-
131
- return topk_masked_gates, topk_map, tokens_per_expert
132
-
133
-
134
  class RotaryEmbedding(nn.Module):
135
  """Rotary Embedding.
136
 
@@ -156,7 +98,10 @@ class RotaryEmbedding(nn.Module):
156
 
157
  dim = kv_channels
158
  self.rotary_interleaved = rotary_interleaved
159
- device = 'cpu' if use_cpu_initialization else torch.cuda.current_device()
 
 
 
160
  self.inv_freq = 1.0 / (
161
  rotary_base ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim)
162
  )
@@ -225,11 +170,6 @@ class IdentityOp(nn.Module):
225
  return x
226
 
227
 
228
- class IdentityFuncOp(nn.Module):
229
- def forward(self, x):
230
- return x
231
-
232
-
233
  class RMSNorm(nn.Module):
234
  def __init__(self, hidden_size, eps=1e-5):
235
  super().__init__()
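
For reference, the RMSNorm used throughout this file (its body is only partially visible in these hunks) follows the usual root-mean-square normalization: scale each hidden vector by the inverse of its RMS and apply a learned per-channel weight, with no mean subtraction. A minimal functional sketch:

import torch

def rms_norm(x, weight, eps=1e-5):
    # Normalize by the root-mean-square over the hidden dimension, then scale.
    return weight * x * torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + eps)

x = torch.randn(4, 1024)
print(rms_norm(x, torch.ones(1024)).shape)   # torch.Size([4, 1024])
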
@@ -264,24 +204,21 @@ class TEDotProductAttention(nn.Module):
264
  self.softmax_scale = softmax_scale
265
  self.drop = nn.Dropout(attention_dropout)
266
 
267
- def forward(self, q,k,v,attention_mask,causal=None, ):
268
  """Implements the multihead softmax attention.
269
  Arguments
270
  ---------
271
- qkv: The tensor containing the query, key, and value. (B, S, 3, H, D)
272
- causal: if passed, will override self.causal
273
- key_padding_mask: boolean mask to apply to the attention weights. True means to keep,
274
- False means to mask out. (B, S)
275
  """
276
- causal = self.causal if causal is None else causal
277
-
278
  q = q.transpose(0,1).contiguous()
279
  k = k.transpose(0,1).contiguous()
280
  v = v.transpose(0,1).contiguous()
281
 
282
  batch_size, seq_len = q.shape[0], q.shape[1]
283
  softmax_scale = self.softmax_scale or 1.0 / math.sqrt(q.shape[-1])
284
- # scores
285
  scores = torch.einsum("bthd,bshd->bhts", q, k * softmax_scale)
286
  scores = scores.masked_fill(attention_mask == 0, float('-1e9'))
287
  # Softmax
@@ -289,42 +226,37 @@ class TEDotProductAttention(nn.Module):
289
  # Dropout
290
  attention_drop = self.drop(attention)
291
  output = torch.einsum("bhts,bshd->bthd", attention_drop, v)
292
- output = output.reshape(batch_size, seq_len, -1).transpose(0,1).contiguous()
293
- return output
294
 
 
 
295
 
296
 
297
  class SelfAttention(nn.Module):
298
  def __init__(self,config,):
299
  super().__init__()
300
  self.config = config
301
- q_layernorm=config.q_layernorm
302
- k_layernorm=config.k_layernorm
303
  self.hidden_size = config.hidden_size
304
  self.core_attention = TEDotProductAttention()
305
  self.linear_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.add_bias_linear,)
306
  self.linear_qkv = nn.Linear(self.hidden_size, 3*self.hidden_size, bias=config.add_bias_linear,)
307
- if q_layernorm:
308
- self.q_layernorm = RMSNorm(self.hidden_size)
309
- else:
310
- self.q_layernorm = IdentityOp()
311
- if k_layernorm:
312
- self.k_layernorm = RMSNorm(self.hidden_size)
313
- else:
314
- self.k_layernorm = IdentityOp()
315
 
316
- def forward(self, x, attention_mask,rotary_pos_emb):
 
 
 
 
 
317
  qkv = self.linear_qkv(x)
318
- qkv = qkv.view(qkv.size(0), qkv.size(1), self.config.num_attention_heads,-1)
319
  q, k, v = qkv.chunk(3, dim=-1)
 
320
  # Apply rotary encoding to q and k
321
  rotary_pos_emb = (rotary_pos_emb,) * 2
322
  q_pos_emb, k_pos_emb = rotary_pos_emb
323
  q = _apply_rotary_pos_emb_bshd(q, q_pos_emb)
324
  k = _apply_rotary_pos_emb_bshd(k, k_pos_emb)
325
 
326
- q = self.q_layernorm(q)
327
- k = self.k_layernorm(k)
328
  # attention
329
  attn_output = self.core_attention(q, k, v, attention_mask)
330
  output = self.linear_proj(attn_output)
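
The core attention here is plain masked softmax attention written with einsum over [batch, time, heads, dim] tensors: per head, scores = q·kᵀ/√d, masked positions pushed to a large negative value before the softmax. A self-contained toy check of that einsum form, independent of the model classes:

import math
import torch

b, t, h, d = 2, 5, 4, 8
q, k, v = (torch.randn(b, t, h, d) for _ in range(3))
mask = torch.ones(b, 1, t, t)                     # 1 = keep, 0 = mask out

scale = 1.0 / math.sqrt(d)
scores = torch.einsum("bthd,bshd->bhts", q, k * scale)
scores = scores.masked_fill(mask == 0, float("-1e9"))
attn = torch.softmax(scores, dim=-1)
out = torch.einsum("bhts,bshd->bthd", attn, v)    # [batch, time, heads, dim]
print(out.shape)
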
@@ -333,7 +265,7 @@ class SelfAttention(nn.Module):
333
 
334
 
335
  class MLP(nn.Module):
336
- def __init__(self,config,in_features):
337
  super().__init__()
338
  self.config= config
339
  self.linear_fc1 = nn.Linear(in_features, self.config.moe_ffn_hidden_size*2, bias=self.config.add_bias_linear,)
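
The doubled fc1 width (moe_ffn_hidden_size*2) suggests a gated feed-forward block in which the projection is split into a gate and a value before the down-projection; the actual activation comes from ACT2FN elsewhere in the file and may differ. A sketch under that assumption:

import torch
import torch.nn as nn
import torch.nn.functional as F

class GatedMLP(nn.Module):
    # Illustrative only: assumes a SwiGLU-style split of the doubled fc1 output.
    def __init__(self, hidden_size, ffn_hidden_size, bias=False):
        super().__init__()
        self.linear_fc1 = nn.Linear(hidden_size, ffn_hidden_size * 2, bias=bias)
        self.linear_fc2 = nn.Linear(ffn_hidden_size, hidden_size, bias=bias)

    def forward(self, x):
        gate, value = self.linear_fc1(x).chunk(2, dim=-1)
        return self.linear_fc2(F.silu(gate) * value)
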
@@ -367,9 +299,14 @@ class TransformerLayer(nn.Module):
367
  self.input_layernorm = IdentityOp()
368
  self.self_attention = SelfAttention(config)
369
  self.pre_mlp_layernorm = RMSNorm(self.config.hidden_size)
370
- self.mlp = MLP(config,self.config.hidden_size)
371
 
372
- def forward(self, x, attention_mask,rotary_pos_emb):
 
 
 
 
 
373
  residual = x
374
  x = self.input_layernorm(x)
375
  x = self.self_attention(x, attention_mask, rotary_pos_emb)
@@ -425,7 +362,7 @@ class FalconTSTExpert(nn.Module):
425
 
426
  # Patchify the input
427
  input_data = input_data.unfold(dimension=-1, size=self.patch_size, step=self.patch_size).contiguous() # input [batch_size, patch_num, patch_size]
428
- hidden_states= self.patch_embedding(input_data) # hidden_states [batch_size, patch_num, hidden_size]
429
  hidden_states = hidden_states.transpose(0, 1).contiguous() # hidden_states [patch_num, batch_size, hidden_size], To adapt to the Megatron
430
 
431
  # Patchify the mask: only the entire time points in a patch are masked then this patch is masked
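
The unfold call above turns a [batch_size, seq_len] series into non-overlapping patches of length patch_size (step == size means no overlap). A toy illustration:

import torch

x = torch.arange(12.0).unsqueeze(0)                     # [1, 12], one series of length 12
patches = x.unfold(dimension=-1, size=4, step=4).contiguous()
print(patches.shape)                                    # torch.Size([1, 3, 4]) -> [batch, patch_num, patch_size]
print(patches)
# tensor([[[ 0.,  1.,  2.,  3.],
#          [ 4.,  5.,  6.,  7.],
#          [ 8.,  9., 10., 11.]]])
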
@@ -436,16 +373,13 @@ class FalconTSTExpert(nn.Module):
436
  attention_mask = attention_mask.unsqueeze(2).repeat(1,1,patch_num) * attention_mask.unsqueeze(1).repeat(1,patch_num,1) # [batch_size, patch_num, patch_num]
437
  attention_mask = attention_mask.unsqueeze(1).contiguous() # [batch_size, 1, patch_num, patch_num]
438
 
439
-
440
  return hidden_states, attention_mask, input_mask
441
 
442
-
443
- def _forward_output(self, hidden_states, output_scale=None, input_mask=None, inference_context=None):
444
  """
445
  Perform a forward pass through the output layer.
446
 
447
  Args:
448
- expert_input (Tensor): Expert input of shape [batch_size, seq_len]
449
  hidden_states (Tensor): Transformed hidden states of shape [patch_num, batch_size, hidden_size]
450
  output_scale (Tensor, optional): Expert probabilities for the output layer [batch_size]
451
  input_mask (Tensor, optional): Expert input mask of shape [batch_size, seq_len], 0:mask, 1:unmask
@@ -466,11 +400,17 @@ class FalconTSTExpert(nn.Module):
466
 
467
  return expert_output
468
 
469
- def forward(self, expert_input, rotary_pos_emb,expert_probs=None):
470
  hidden_states, attention_mask, input_mask = self._forward_patch_embedding(expert_input)
 
 
 
 
471
  for layer in self.layers:
472
- hidden_states = layer(hidden_states,attention_mask,rotary_pos_emb[:hidden_states.shape[0]])
 
473
  hidden_states = self.final_layernorm(hidden_states)
 
474
  expert_output = self._forward_output(hidden_states, expert_probs, input_mask)
475
  return expert_output
476
 
@@ -512,174 +452,47 @@ class SequentialFalconTST(nn.Module):
512
  return expert_output
513
 
514
 
515
- class RouterGatingLinearFunction(torch.autograd.Function):
516
- """
517
- Autograd function for router gating linear.
518
- """
519
-
520
- @staticmethod
521
- def forward(ctx, inp: torch.Tensor, weight: torch.Tensor, router_dtype: torch.dtype):
522
- """
523
- Forward pass of the RouterGatingLinearFunction function.
524
- """
525
- ctx.router_dtype = router_dtype
526
- ctx.input_dtype = inp.dtype
527
- ctx.weight_dtype = weight.dtype
528
- inp_shape = inp.shape
529
- inp = inp.view(-1, inp_shape[-1])
530
-
531
- output = torch.mm(inp.to(router_dtype), weight.to(router_dtype).t())
532
-
533
- output = output.view(*inp_shape[:-1], -1)
534
- return output
535
-
536
-
537
- def router_gating_linear(inp: torch.Tensor, weight: torch.Tensor, router_dtype: torch.dtype):
538
- """
539
- Customized linear layer for router gating.
540
- This linear layer accepts bfloat16 input and weight, and can return output with router_dtype.
541
- It can reduce the memory usage by avoiding saving the intermediate high precision tensors.
542
- """
543
- return RouterGatingLinearFunction.apply(inp, weight, router_dtype)
544
-
545
-
546
- class Router(ABC,nn.Module):
547
- """Base Router class"""
548
-
549
- def __init__(
550
- self, config: FalconTSTConfig,
551
- ) -> None:
552
- """
553
- Initialize the Router module.
554
-
555
- Args:
556
- config (TransformerConfig): Configuration object for the Transformer model.
557
- model_comm_pgs (ModelCommProcessGroups, optional): Process groups for MoE operations.
558
- """
559
  super().__init__()
560
  self.config = config
 
561
 
562
- # Initialize the gate weights.
563
-
564
- if self.config.patch_size_list is not None:
565
- assert self.config.moe_router_input_size is not None
566
- self.weight = torch.nn.Parameter(
567
- torch.empty((self.config.num_moe_experts, self.config.moe_router_input_size), dtype=torch.float32)
568
- )
569
- else:
570
- self.weight = torch.nn.Parameter(
571
- torch.empty((self.config.num_moe_experts, self.config.hidden_size), dtype=torch.float32)
572
- )
573
  self.reset_parameters()
574
-
575
- def reset_parameters(self):
576
- """Reset the router parameters."""
577
- torch.nn.init.normal_(self.weight,mean=0,std=self.config.init_method_std)
578
- self.weight.data = self.weight.data.to(dtype=self.config.torch_dtype)
579
-
580
 
581
- def gating(self, input: torch.Tensor):
582
- """Forward pass of the router gate.
583
-
584
- Args:
585
- input (torch.Tensor): Input tensor.
586
-
587
- Returns:
588
- torch.Tensor: Logits tensor.
589
- """
590
- if self.weight.device != input.device:
591
- self.weight = self.weight.to(input.device)
592
- router_dtype = input.dtype
593
- logits = router_gating_linear(input, self.weight, router_dtype)
594
- return logits
595
 
596
- @abstractmethod
597
  def routing(self, logits: torch.Tensor):
598
- """Routing function.
599
-
600
- Args:
601
- logits (torch.Tensor): Logits tensor.
602
-
603
- Returns:
604
- Tuple[torch.Tensor, torch.Tensor]: A tuple containing token assignment
605
- probabilities and mapping.
606
- """
607
- raise NotImplementedError("Routing function not implemented.")
608
-
609
- @abstractmethod
610
- def forward(self, input: torch.Tensor):
611
- """
612
- Forward pass of the router.
613
-
614
- Args:
615
- input (torch.Tensor): Input tensor.
616
- """
617
- raise NotImplementedError("Forward function not implemented.")
618
-
619
-
620
- class TopKRouter(Router):
621
- """Route each token to the top-k experts."""
622
 
623
- def __init__(
624
- self, config: FalconTSTConfig,
625
- ) -> None:
626
- """Initialize the zero token dropping router.
627
-
628
- Args:
629
- config (TransformerConfig): The configuration for the transformer model.
630
- model_comm_pgs (ModelCommProcessGroups, optional): Process groups for MoE operations.
631
- """
632
- super().__init__(config=config)
633
- self.topk = self.config.moe_router_topk
634
- self.score_function = self.config.moe_router_score_function
635
-
636
- self.enable_expert_bias = self.config.moe_router_enable_expert_bias
637
- if self.enable_expert_bias:
638
- self.register_buffer(
639
- 'local_tokens_per_expert',
640
- torch.zeros(self.config.num_moe_experts, dtype=torch.float32),
641
- persistent=False,
642
- )
643
- self.register_buffer(
644
- 'expert_bias', torch.zeros(self.config.num_moe_experts, dtype=torch.float32)
645
- )
646
  else:
647
- self.local_tokens_per_expert = None
648
- self.expert_bias = None
649
-
650
-
651
- def routing(self, logits: torch.Tensor):
652
- """Top-k routing function
653
-
654
- Args:
655
- logits (torch.Tensor): Logits tensor after gating.
656
-
657
- Returns:
658
- probs (torch.Tensor): The probabilities of token to experts assignment.
659
- routing_map (torch.Tensor): The mapping of token to experts assignment,
660
- with shape [num_tokens, num_experts].
661
- """
662
- logits = logits.view(-1, self.config.num_moe_experts)
663
-
664
- scores, routing_map, tokens_per_expert = topk_softmax_with_capacity(
665
- logits,
666
- self.topk,
667
- use_pre_softmax=self.config.moe_router_pre_softmax,
668
- score_function=self.score_function,
669
- expert_bias=self.expert_bias,
670
- )
671
- return scores, routing_map
672
 
 
 
673
  def forward(self, input: torch.Tensor):
674
- """
675
- Forward pass of the router.
676
-
677
- Args:
678
- input (torch.Tensor): Input tensor.
679
- """
680
- logits = self.gating(input)
681
 
682
- scores, routing_map = self.routing(logits)
683
 
684
  return scores, routing_map
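
The removed router above scores each sample against every expert and keeps the top-k; with moe_router_pre_softmax and softmax scoring, probabilities are computed over all experts first and then the top-k are gathered. A simplified sketch of that selection logic (softmax scoring only; the sigmoid and expert-bias paths of topk_softmax_with_capacity are omitted), on toy logits:

import torch

def topk_softmax_routing(logits, topk, pre_softmax=True):
    # logits: [num_samples, num_experts]
    if pre_softmax:
        scores = torch.softmax(logits, dim=-1)
        probs, top_idx = torch.topk(scores, k=topk, dim=-1)
    else:
        top_logits, top_idx = torch.topk(logits, k=topk, dim=-1)
        probs = torch.softmax(top_logits, dim=-1)
    routing_probs = torch.zeros_like(logits).scatter(1, top_idx, probs)
    routing_map = torch.zeros_like(logits).scatter(1, top_idx, 1.0).bool()
    tokens_per_expert = routing_map.sum(dim=0)
    return routing_probs, routing_map, tokens_per_expert

logits = torch.tensor([[0.2, 1.5, -0.3, 0.1],
                       [2.0, 0.0,  0.0, 0.0]])   # 2 samples, 4 experts
probs, routing_map, counts = topk_softmax_routing(logits, topk=1)
print(routing_map)   # one True per row: the selected expert
print(counts)        # how many samples landed on each expert
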
685
 
@@ -702,8 +515,8 @@ class FalconTSTMoELayer(nn.Module):
702
  self.expert_output_size = config.seq_length
703
 
704
  if self.is_last_layer and self.config.heterogeneous_moe_layer:
705
- # If heterogeneous_moe_layer is True, the backcast will be None
706
- self.backcast_layernorm = None
707
  else:
708
  self.backcast_layernorm = RMSNorm(self.seq_length)
709
 
@@ -784,42 +597,9 @@ class FalconTSTMoELayer(nn.Module):
784
  # permuted_probs (global_probs): [num_permuted_samples_after_dispatch_postprocess(sorted)]
785
 
786
  experts_output = self.experts(input, routing_map, rotary_pos_emb, probs)
787
-
788
 
789
  return experts_output, shared_experts_output
790
 
791
- def postprocess(
792
- self,
793
- backcast: torch.Tensor, # [batch_size, seq_len]
794
- forecast: torch.Tensor, # [batch_size, pred_len]
795
- output_backcast: torch.Tensor, # [batch_size, seq_len]
796
- output_forecast: torch.Tensor, # [batch_size, pred_len]
797
- ):
798
- """
799
- Args:
800
- backcast (torch.Tensor): The previous layer's backcast time series (samples). [batch_size, seq_len]
801
- forecast (torch.Tensor): The previous layer's forecast time series (samples). [batch_size, pred_len]
802
- output_backcast (torch.Tensor): The current layer's output backcast time series (samples). [batch_size, seq_len]
803
- output_forecast (torch.Tensor): The current layer's output forecast time series (samples). [batch_size, pred_len]
804
- means (torch.Tensor): The means of the non-masked backcast time series (samples). [batch_size, 1]
805
- stdev (torch.Tensor): The standard deviation of the non-masked backcast time series (samples). [batch_size, 1]
806
- backcast_mask (torch.Tensor): The previous layer's backcast mask of time series (samples) . [batch_size, seq_len]
807
- """
808
- if output_backcast is not None:
809
- # 25/8/14 @modified by xiaming replace the revin with layernorm after the moe layer
810
- # And if we multiply the output_backcast with the input mask, the performance will be hurted
811
- output_backcast = self.backcast_layernorm(output_backcast) # LayerNorm
812
- if self.config.residual_backcast:
813
- output_backcast = backcast - output_backcast
814
-
815
- output_backcast[~self.input_mask] = self.config.mask_pad_value # Important! Recover the mask time point back to mask_pad_value(default:255.)
816
-
817
- if self.config.do_expert_forecast and forecast is not None: # The first layer's forecast is None
818
- output_forecast = forecast + output_forecast
819
-
820
- return output_backcast, output_forecast
821
-
822
-
823
  def combine(
824
  self,
825
  experts_output: torch.Tensor,
@@ -828,8 +608,7 @@ class FalconTSTMoELayer(nn.Module):
828
  """Combines expert outputs via communication and adds shared expert output.
829
 
830
  This method uses the time series(sample) dispatcher to combine the outputs from different
831
- experts (e.g., via an All-to-All communication). It then adds the output
832
- from the shared expert if it exists.
833
  """
834
  assert experts_output.shape == shared_experts_output.shape,\
835
  f'experts_output shape {experts_output.shape} doesn\'t equal to shared_experts_output shape:{shared_experts_output.shape}'
@@ -854,7 +633,36 @@ class FalconTSTMoELayer(nn.Module):
854
  return output_backcast, output_forecast
855
 
856
 
857
- def forward(self, backcast,forecast,rotary_pos_emb):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
858
  inputs, probs, residual, routing_map = self.router_and_preprocess(backcast)
859
  experts_output, shared_experts_output = self.experts_compute(inputs, probs, residual, rotary_pos_emb, routing_map)
860
  output_backcast, output_forecast = self.combine(experts_output, shared_experts_output)
@@ -862,20 +670,31 @@ class FalconTSTMoELayer(nn.Module):
862
  return output_backcast, output_forecast
863
 
864
 
865
-
866
  class FalconTSTBlock(nn.Module):
867
- def __init__(self,config):
868
  super().__init__()
869
  self.config = config
 
 
 
 
 
 
870
  self.layers = nn.ModuleList([
871
- FalconTSTMoELayer(config,layer_num +1)
872
- for layer_num in range(self.config.num_hidden_layers)
873
- ])
874
- def forward(self, x,rotary_pos_emb):
 
875
  backcast = x
876
  forecast = None
 
 
 
 
 
877
  for layer in self.layers:
878
- backcast, forecast = layer(backcast,forecast,rotary_pos_emb)
879
  return backcast,forecast
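
FalconTSTBlock threads two signals through its MoE layers: each layer refines the backcast (with residual_backcast, the layer's reconstruction is subtracted from the incoming backcast) and accumulates its contribution into the running forecast, whose initial value is None. A schematic of that doubly residual loop, with stand-in layer outputs:

import torch

def doubly_residual_stack(x, layers, residual_backcast=True):
    # Each "layer" returns (layer_backcast, layer_forecast) for the current backcast.
    backcast, forecast = x, None
    for layer in layers:
        layer_backcast, layer_forecast = layer(backcast)
        backcast = backcast - layer_backcast if residual_backcast else layer_backcast
        forecast = layer_forecast if forecast is None else forecast + layer_forecast
    return backcast, forecast

# Toy layers: each one "explains" half of what it sees and forecasts a constant.
toy_layer = lambda b: (0.5 * b, torch.ones(b.shape[0], 3))
backcast, forecast = doubly_residual_stack(torch.randn(2, 8), [toy_layer, toy_layer])
print(backcast.shape, forecast.shape)   # torch.Size([2, 8]) torch.Size([2, 3])
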
880
 
881
 
@@ -900,24 +719,28 @@ class FalconTSTPreTrainedModel(PreTrainedModel):
900
  if module.padding_idx is not None:
901
  module.weight.data[module.padding_idx].zero_()
902
 
 
903
  class FalconTSTModel(FalconTSTPreTrainedModel):
904
  def __init__(self, config: FalconTSTConfig):
905
  super().__init__(config)
906
  self.config = config
907
- self.seq_length = config.seq_length
908
  self.rotary_pos_emb = RotaryEmbedding(
909
- kv_channels=self.config.kv_channels,
910
- rotary_base=config.rotary_base,
911
- use_cpu_initialization=self.config.use_cpu_initialization,
912
- rotary_interleaved=self.config.rotary_interleaved
913
  )
914
  self.decoder = FalconTSTBlock(
915
- config=config
916
- )
 
917
  if self.config.do_expert_forecast and self.config.heterogeneous_moe_layer:
918
  self.output_layer = IdentityOp()
919
  else:
920
- self.output_layer = nn.Linear(in_features=self.seq_length, out_features=self.config.pred_length, bias=self.config.add_bias_linear,)
 
 
921
 
922
 
923
  def revin(
@@ -946,13 +769,8 @@ class FalconTSTModel(FalconTSTPreTrainedModel):
946
  return input, means, stdev
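
The revin step is reversible instance normalization: each series is normalized by its own mean and standard deviation before entering the model, and the returned means/stdev are later used to put the forecast back on the original scale. A minimal sketch of the idea (the real method also respects the padding mask when computing the statistics):

import torch

def revin_normalize(x, eps=1e-5):
    # Per-sample statistics over the time dimension.
    means = x.mean(dim=-1, keepdim=True)
    stdev = torch.sqrt(x.var(dim=-1, keepdim=True, unbiased=False) + eps)
    return (x - means) / stdev, means, stdev

def revin_denormalize(y, means, stdev):
    # Applied to the forecast so it returns to the original scale.
    return y * stdev + means

x = torch.randn(2, 16) * 10 + 3
x_norm, means, stdev = revin_normalize(x)
y_pred = x_norm[:, -4:]                       # stand-in "forecast" in normalized space
print(revin_denormalize(y_pred, means, stdev).shape)
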
947
 
948
  def forward(self, input, revin):
949
- # Apply rotary position embeddings
950
- # seq_len = patches.size(1)
951
- # pos_emb = self.rotary_pos_emb(seq_len, patches.device)
952
- # patches = patches + pos_emb
953
-
954
  batch_size, input_len = input.shape
955
- # @created by xiaming @modified by baichun
956
  # realize varied input length
957
  if input_len > self.seq_length:
958
  input = input[:, -self.seq_length:]
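
Variable-length inputs are reconciled to the fixed context length: histories longer than seq_length keep only the most recent points, and, judging from the mask handling elsewhere in the file, shorter ones would be left-padded with mask_pad_value and marked as masked (0 = mask, 1 = unmask). A sketch under that assumption (the padding side and value are not shown in this hunk):

import torch

def fit_to_context(x, seq_length, pad_value=255.0):
    # x: [batch_size, input_len] -> ([batch_size, seq_length], mask with 1 = real point)
    batch_size, input_len = x.shape
    if input_len > seq_length:
        x = x[:, -seq_length:]
        mask = torch.ones_like(x)
    else:
        pad = x.new_full((batch_size, seq_length - input_len), pad_value)
        mask = torch.cat([torch.zeros_like(pad), torch.ones_like(x)], dim=1)
        x = torch.cat([pad, x], dim=1)
    return x, mask

x, mask = fit_to_context(torch.randn(2, 100), seq_length=256)
print(x.shape, mask.sum(dim=1))    # torch.Size([2, 256]) tensor([100., 100.])
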
@@ -972,7 +790,7 @@ class FalconTSTModel(FalconTSTPreTrainedModel):
972
  rotary_pos_emb = self.rotary_pos_emb(input_len, device=input.device)
973
 
974
  # Step3. Do one-step inference to get mixed forecasts from multiple forecast heads
975
- # mixed_pred: [batch_size, sum(multi_forecast_head)]
976
  mixed_pred = self._inference_step(
977
  input=input,
978
  input_mask=input_mask,
@@ -1005,12 +823,12 @@ class FalconTSTModel(FalconTSTPreTrainedModel):
1005
  rotary_pos_emb,
1006
  ):
1007
  if self.config.do_base_forecast:
1008
- base_forecast, _ = self.base_output_layer(input)
1009
  else:
1010
  base_forecast = None
1011
 
1012
  decoder_backcast, decoder_forecast = self.decoder(
1013
- input, # [batch_size, seq_len]
1014
  rotary_pos_emb, # [input_len, 1, 1, kv_channels(hidden_size // num_heads)]
1015
  )
1016
 
@@ -1019,12 +837,12 @@ class FalconTSTModel(FalconTSTPreTrainedModel):
1019
  if self.config.heterogeneous_moe_layer:
1020
  decoder_forecast = self.output_layer(decoder_forecast) # IdentityOp
1021
  else:
1022
- final_forecast= self.output_layer(decoder_backcast * input_mask)
1023
  decoder_forecast = decoder_forecast + final_forecast
1024
  else:
1025
  # The decoder_backcast contains the mask_pad_val(default:255.)
1026
  decoder_forecast, _ = self.output_layer(decoder_backcast * input_mask)
1027
-
1028
  if self.config.do_base_forecast:
1029
  assert base_forecast is not None, f'base_forecast is None'
1030
  FalconTST_forecast = base_forecast + decoder_forecast
@@ -1080,129 +898,62 @@ class FalconTSTModel(FalconTSTPreTrainedModel):
1080
 
1081
  final_output = final_output[:, :self.config.inference_length]
1082
 
1083
- elif auto_regressive_strategy == 'from_short_to_long':
1084
- # From short to long
1085
- # in validate_args, it has been sorted, and check the valid config
1086
- multi_forecast_head_list = sorted(self.config.multi_forecast_head_list)
1087
- multi_forecast_head_dict = {}
1088
- for idx, head_pred_len in enumerate(self.config.multi_forecast_head_list):
1089
- if idx == len(multi_forecast_head_list) - 1:
1090
- ar_step = math.ceil(self.config.inference_length / head_pred_len)
1091
- else:
1092
- ar_step = min(
1093
- self.config.autoregressive_step_list[idx],
1094
- self.config.multi_forecast_head_list[idx + 1] // self.config.multi_forecast_head_list[idx]
1095
- )
1096
- # ar_step = multi_forecast_head_list[idx + 1] // multi_forecast_head_list[idx]
1097
-
1098
- multi_forecast_head_dict[head_pred_len] = ar_step
1099
-
1100
- # the core idea of strategy [from_short_to_long]
1101
- mixed_pred = FalconTST_forecast
1102
- output_list = []
1103
- cur_pred = None
1104
- cur_pred_len = 0
1105
-
1106
- # from the first(shortest) as begining
1107
- for idx, head_pred_len in enumerate(self.config.multi_forecast_head_list):
1108
- # assert cur_pred_len <= head_pred_len, \
1109
- # "Accumulated prediction length exceeds the prediction length of current forecast head"
1110
-
1111
- ar_step = multi_forecast_head_dict[head_pred_len]
1112
- if ar_step == 0:
1113
- # Ignore the current forecast head
1114
- continue
1115
-
1116
- # Add current head's first auto-regressive step of prediction
1117
- head_pred = mixed_pred[:, :head_pred_len] # [single]
1118
- output_list.append(head_pred[:, cur_pred_len:])
1119
- cur_pred = torch.cat(output_list, dim=1)
1120
- cur_pred_len = cur_pred.shape[1]
1121
- if cur_pred_len >= self.config.inference_length:
1122
- break
1123
-
1124
- # Do auto-regressive of the rest of the steps
1125
- for _ in range(1, ar_step + 1):
1126
- # one-step model prediction
1127
- cur_input = torch.cat([input, cur_pred], dim=1)[:, -self.seq_length:].contiguous()
1128
- cur_input_mask = torch.cat(
1129
- [input_mask,
1130
- torch.ones(cur_pred.shape, dtype=input_mask.dtype, device=input_mask.device)],
1131
- dim=1)[:, -self.seq_length:].contiguous() # 0:mask, 1:unmask
1132
-
1133
- FalconTST_forecast = self._inference_step(
1134
- input=cur_input,
1135
- input_mask=cur_input_mask,
1136
- rotary_pos_emb=rotary_pos_emb,
1137
- )
1138
-
1139
- head_pred = FalconTST_forecast[:, :head_pred_len]
1140
- output_list.append(head_pred)
1141
- cur_pred = torch.cat(output_list, dim=1)
1142
- cur_pred_len = cur_pred.shape[1]
1143
- if cur_pred_len >= self.config.inference_length:
1144
- break
1145
-
1146
- if cur_pred_len >= self.config.inference_length:
1147
- break
1148
-
1149
- final_output = cur_pred[:, :self.config.inference_length] # [batch_size, inference_len]
1150
 
1151
  assert final_output.shape[1] == self.config.inference_length
1152
  return final_output
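
The removed from_short_to_long strategy above rolls the model forward autoregressively: take the current prediction, append it to the history, re-run a one-step inference on the refreshed context, and repeat until inference_length points are covered, moving to longer forecast heads as the horizon grows. The basic rollout loop, schematic only (the real code also updates the input mask and switches between forecast heads of different lengths), with the model call abstracted away:

import torch

def autoregressive_rollout(one_step_fn, history, head_pred_len, inference_length, seq_length):
    # one_step_fn(context) -> [batch_size, head_pred_len] forecast for the given context.
    preds = []
    context = history
    while sum(p.shape[1] for p in preds) < inference_length:
        step_pred = one_step_fn(context)[:, :head_pred_len]
        preds.append(step_pred)
        context = torch.cat([context, step_pred], dim=1)[:, -seq_length:]
    return torch.cat(preds, dim=1)[:, :inference_length]

# Toy one-step model that just repeats the last observed value.
one_step = lambda ctx: ctx[:, -1:].repeat(1, 24)
out = autoregressive_rollout(one_step, torch.randn(2, 96), head_pred_len=24,
                             inference_length=60, seq_length=96)
print(out.shape)   # torch.Size([2, 60])
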
1153
 
1154
- class FalconTSTForPrediction(FalconTSTPreTrainedModel, FalconTSTGenerationMixin):
 
1155
  def __init__(self, config: FalconTSTConfig):
1156
  super().__init__(config)
1157
  self.config = config
1158
  self.model = FalconTSTModel(self.config)
1159
  self.post_init()
1160
 
1161
- def forward(
 
1162
  self,
1163
- input_ids: torch.FloatTensor,
1164
- attention_mask: Optional[torch.Tensor] = None,
1165
- labels: Optional[torch.FloatTensor] = None,
1166
- return_dict: Optional[bool] = False,
1167
- max_output_length: Optional[int] = None,
1168
- revin: Optional[bool] = False,
1169
- ):
1170
- self.model.config.inference_length = max_output_length
1171
- outputs = self.model(
1172
- input=input_ids,
1173
- revin=revin
1174
- )
1175
-
1176
- loss = None
1177
- logits = outputs
1178
-
1179
- if labels is not None:
1180
- loss_fn = nn.MSELoss()
1181
- loss = loss_fn(logits, labels)
1182
-
1183
- if not return_dict:
1184
- output = (logits,)
1185
- return ((loss,) + output) if loss is not None else output
1186
 
1187
- return logits
 
 
 
1188
 
1189
- def prepare_inputs_for_generation(
1190
- self,
1191
- input_ids,
1192
- past_key_values=None,
1193
- attention_mask=None,
1194
- inputs_embeds=None,
1195
- revin=False,
1196
- **kwargs
1197
- ):
1198
- """
1199
- Prepare model inputs for autoregressive generation.
1200
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1201
 
1202
- model_inputs = {"input_ids": input_ids}
1203
-
1204
- model_inputs.update({
1205
- "revin": revin,
1206
- })
 
 
1207
 
1208
- return model_inputs
 
10
  from functools import reduce
11
  from abc import ABC, abstractmethod
12
  from .configuration_FalconTST import FalconTSTConfig
 
13
  from transformers import PreTrainedModel, Cache, DynamicCache
14
  from transformers.activations import ACT2FN
15
  from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
 
73
  return torch.cat((t, t_pass), dim=-1)
74
 
75
76
  class RotaryEmbedding(nn.Module):
77
  """Rotary Embedding.
78
 
 
98
 
99
  dim = kv_channels
100
  self.rotary_interleaved = rotary_interleaved
101
+ if use_cpu_initialization or not torch.cuda.is_available():
102
+ device = 'cpu'
103
+ else:
104
+ device = torch.cuda.current_device()
105
  self.inv_freq = 1.0 / (
106
  rotary_base ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim)
107
  )
 
170
  return x
171
 
172
 
 
 
 
 
 
173
  class RMSNorm(nn.Module):
174
  def __init__(self, hidden_size, eps=1e-5):
175
  super().__init__()
 
204
  self.softmax_scale = softmax_scale
205
  self.drop = nn.Dropout(attention_dropout)
206
 
207
+ def forward(self, q, k, v, attention_mask):
208
  """Implements the multihead softmax attention.
209
  Arguments
210
  ---------
211
+ q,k,v: The tensor containing the query, key, and value. [seq_len, batch_size, hidden_size]
212
+ attention_mask: boolean mask to apply to the attention weights. True means to keep,
213
+ False means to mask out. [batch_size, 1, seq_len, seq_len]
 
214
  """
 
 
215
  q = q.transpose(0,1).contiguous()
216
  k = k.transpose(0,1).contiguous()
217
  v = v.transpose(0,1).contiguous()
218
 
219
  batch_size, seq_len = q.shape[0], q.shape[1]
220
  softmax_scale = self.softmax_scale or 1.0 / math.sqrt(q.shape[-1])
221
+ # scores
222
  scores = torch.einsum("bthd,bshd->bhts", q, k * softmax_scale)
223
  scores = scores.masked_fill(attention_mask == 0, float('-1e9'))
224
  # Softmax
 
226
  # Dropout
227
  attention_drop = self.drop(attention)
228
  output = torch.einsum("bhts,bshd->bthd", attention_drop, v)
229
+ output = output.reshape(batch_size, seq_len, -1)
 
230
 
231
+ output = output.transpose(0,1).contiguous()
232
+ return output
233
 
234
 
235
  class SelfAttention(nn.Module):
236
  def __init__(self,config,):
237
  super().__init__()
238
  self.config = config
 
 
239
  self.hidden_size = config.hidden_size
240
  self.core_attention = TEDotProductAttention()
241
  self.linear_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.add_bias_linear,)
242
  self.linear_qkv = nn.Linear(self.hidden_size, 3*self.hidden_size, bias=config.add_bias_linear,)
 
 
 
 
 
 
 
 
243
 
244
+ def forward(self, x, attention_mask, rotary_pos_emb):
245
+ '''
246
+ x: [seq_len, batch_size, hidden_size]
247
+ attention_mask: [batch_size, 1, seq_len, seq_len]
248
+ rotary_pos_emb: [seq_len, 1, 1, kv_channels(hidden_size // num_heads)]
249
+ '''
250
  qkv = self.linear_qkv(x)
251
+ qkv = qkv.view(qkv.size(0), qkv.size(1), self.config.num_attention_heads, -1)
252
  q, k, v = qkv.chunk(3, dim=-1)
253
+
254
  # Apply rotary encoding to q and k
255
  rotary_pos_emb = (rotary_pos_emb,) * 2
256
  q_pos_emb, k_pos_emb = rotary_pos_emb
257
  q = _apply_rotary_pos_emb_bshd(q, q_pos_emb)
258
  k = _apply_rotary_pos_emb_bshd(k, k_pos_emb)
259
 
 
 
260
  # attention
261
  attn_output = self.core_attention(q, k, v, attention_mask)
262
  output = self.linear_proj(attn_output)
 
265
 
266
 
267
  class MLP(nn.Module):
268
+ def __init__(self, config, in_features):
269
  super().__init__()
270
  self.config= config
271
  self.linear_fc1 = nn.Linear(in_features, self.config.moe_ffn_hidden_size*2, bias=self.config.add_bias_linear,)
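
The fc1 projection to moe_ffn_hidden_size*2 suggests a gated activation; a hedged sketch of that common pattern (the chunk-and-SiLU gating below is an assumption, since the rest of the MLP body is not shown in this hunk):

import torch
import torch.nn as nn
import torch.nn.functional as F

class GatedMLPSketch(nn.Module):
    def __init__(self, in_features, ffn_hidden_size, bias=False):
        super().__init__()
        self.linear_fc1 = nn.Linear(in_features, ffn_hidden_size * 2, bias=bias)  # gate and up projections fused
        self.linear_fc2 = nn.Linear(ffn_hidden_size, in_features, bias=bias)

    def forward(self, x):
        gate, up = self.linear_fc1(x).chunk(2, dim=-1)
        return self.linear_fc2(F.silu(gate) * up)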
 
299
  self.input_layernorm = IdentityOp()
300
  self.self_attention = SelfAttention(config)
301
  self.pre_mlp_layernorm = RMSNorm(self.config.hidden_size)
302
+ self.mlp = MLP(config, self.config.hidden_size)
303
 
304
+ def forward(self, x, attention_mask, rotary_pos_emb):
305
+ '''
306
+ x: [seq_len, batch_size, hidden_size]
307
+ attention_mask: [batch_size, 1, seq_len, seq_len]
308
+ rotary_pos_emb: [seq_len, 1, 1, kv_channels(hidden_size // num_heads)]
309
+ '''
310
  residual = x
311
  x = self.input_layernorm(x)
312
  x = self.self_attention(x, attention_mask, rotary_pos_emb)
 
362
 
363
  # Patchify the input
364
  input_data = input_data.unfold(dimension=-1, size=self.patch_size, step=self.patch_size).contiguous() # input [batch_size, patch_num, patch_size]
365
+ hidden_states = self.patch_embedding(input_data) # hidden_states [batch_size, patch_num, hidden_size]
366
  hidden_states = hidden_states.transpose(0, 1).contiguous() # hidden_states [patch_num, batch_size, hidden_size], transposed to match the Megatron-style layout
367
 
368
  # Patchify the mask: a patch is treated as masked only if every time point inside it is masked
 
373
  attention_mask = attention_mask.unsqueeze(2).repeat(1,1,patch_num) * attention_mask.unsqueeze(1).repeat(1,patch_num,1) # [batch_size, patch_num, patch_num]
374
  attention_mask = attention_mask.unsqueeze(1).contiguous() # [batch_size, 1, patch_num, patch_num]
375
 
 
376
  return hidden_states, attention_mask, input_mask
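
The unfold-based patchify and the patch-level mask rule can be illustrated in isolation; a small sketch with hypothetical sizes (patch_size=4 is chosen only for illustration, and the mask derivation mirrors the masking comment above rather than code shown in this hunk):

import torch

batch_size, seq_len, patch_size, mask_pad_value = 2, 12, 4, 255.0
x = torch.randn(batch_size, seq_len)
patches = x.unfold(dimension=-1, size=patch_size, step=patch_size).contiguous()
print(patches.shape)  # torch.Size([2, 3, 4]) -> [batch_size, patch_num, patch_size]

point_mask = (x != mask_pad_value)                                      # [batch_size, seq_len], True = observed
patch_observed = point_mask.unfold(-1, patch_size, patch_size).any(-1)  # a patch is masked only if all of its points are masked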
377
 
378
+ def _forward_output(self, hidden_states, output_scale=None, input_mask=None):
 
379
  """
380
  Perform a forward pass through the output layer.
381
 
382
  Args:
 
383
  hidden_states (Tensor): Transformed hidden states of shape [patch_num, batch_size, hidden_size]
384
  output_scale (Tensor, optional): Expert probabilities for the output layer [batch_size]
385
  input_mask (Tensor, optional): Expert input mask of shape [batch_size, seq_len], 0:mask, 1:unmask
 
400
 
401
  return expert_output
402
 
403
+ def forward(self, expert_input, rotary_pos_emb, expert_probs=None):
404
  hidden_states, attention_mask, input_mask = self._forward_patch_embedding(expert_input)
405
+ # hidden_states: [patch_num, batch_size, hidden_size]
406
+ # attention_mask: [batch_size, 1, patch_num, patch_num]
407
+ # input_mask: [batch_size, seq_len]
408
+
409
  for layer in self.layers:
410
+ hidden_states = layer(hidden_states, attention_mask, rotary_pos_emb[:hidden_states.shape[0]])
411
+
412
  hidden_states = self.final_layernorm(hidden_states)
413
+
414
  expert_output = self._forward_output(hidden_states, expert_probs, input_mask)
415
  return expert_output
416
 
 
452
  return expert_output
453
 
454
 
455
+ class TopKRouter(nn.Module):
456
+ def __init__(self, config: FalconTSTConfig):
 
 
 
 
457
  super().__init__()
458
  self.config = config
459
+ self.topk = config.moe_router_topk
460
 
461
+ self.weight = nn.Parameter(
462
+ torch.empty((config.num_moe_experts, config.moe_router_input_size), dtype=torch.float32)
463
+ )
 
 
 
 
 
 
 
 
464
  self.reset_parameters()
 
 
 
 
 
 
465
 
466
+ def reset_parameters(self):
467
+ nn.init.normal_(self.weight, mean=0, std=self.config.init_method_std)
 
 
 
 
 
 
 
 
 
 
 
 
468
 
 
469
  def routing(self, logits: torch.Tensor):
470
+ score_function = self.config.moe_router_score_function
 
 
 
 
471
 
472
+ if score_function == "softmax":
473
+ if self.config.moe_router_pre_softmax:
474
+ scores = torch.softmax(logits, dim=-1, dtype=torch.float32).type_as(logits)
475
+ probs, top_indices = torch.topk(scores, self.topk, dim=1)
476
+ else:
477
+ scores, top_indices = torch.topk(logits, self.topk, dim=1)
478
+ probs = torch.softmax(scores, dim=-1, dtype=torch.float32).type_as(logits)
 
 
 
 
479
  else:
480
+ raise NotImplementedError
481
+
482
+ routing_probs = torch.zeros_like(logits).scatter_(1, top_indices, probs)
483
+ routing_map = torch.zeros_like(logits, dtype=torch.bool).scatter_(1, top_indices, True)
 
 
 
 
484
 
485
+ return routing_probs, routing_map
486
+
487
  def forward(self, input: torch.Tensor):
488
+ if self.weight.device != input.device:
489
+ self.weight.data = self.weight.data.to(input.device)
490
+
491
+ gating_logits = F.linear(input, self.weight)
492
+ num_tokens = gating_logits.shape[:-1].numel()
493
+ gating_logits = gating_logits.view(num_tokens, self.config.num_moe_experts)
 
494
 
495
+ scores, routing_map = self.routing(gating_logits)
496
 
497
  return scores, routing_map
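
The routing path above (optional pre-softmax scoring, top-k selection, then scatter back to dense per-expert maps) can be condensed into a few lines; a minimal sketch with toy sizes, independent of the class:

import torch

num_tokens, num_experts, topk = 5, 4, 1
logits = torch.randn(num_tokens, num_experts)

scores = torch.softmax(logits, dim=-1, dtype=torch.float32).type_as(logits)  # moe_router_pre_softmax=True branch
probs, top_indices = torch.topk(scores, topk, dim=1)

routing_probs = torch.zeros_like(logits).scatter_(1, top_indices, probs)
routing_map = torch.zeros_like(logits, dtype=torch.bool).scatter_(1, top_indices, True)
print(routing_probs.sum(dim=1))  # each token carries only the probability mass of its selected expert(s)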
498
 
 
515
  self.expert_output_size = config.seq_length
516
 
517
  if self.is_last_layer and self.config.heterogeneous_moe_layer:
518
+ # When heterogeneous_moe_layer is True, the last layer produces no backcast, so no backcast layernorm is created
519
+ self.backcast_layernorm = None
520
  else:
521
  self.backcast_layernorm = RMSNorm(self.seq_length)
522
 
 
597
  # permuted_probs (global_probs): [num_permuted_samples_after_dispatch_postprocess(sorted)]
598
 
599
  experts_output = self.experts(input, routing_map, rotary_pos_emb, probs)
 
600
 
601
  return experts_output, shared_experts_output
602
 
 
 
 
603
  def combine(
604
  self,
605
  experts_output: torch.Tensor,
 
608
  """Combines expert outputs via communication and adds shared expert output.
609
 
610
  This method uses the time series (sample) dispatcher to combine the outputs from different
611
+ experts. It then adds the output from the shared expert if it exists.
 
612
  """
613
  assert experts_output.shape == shared_experts_output.shape,\
614
  f'experts_output shape {experts_output.shape} does not match shared_experts_output shape {shared_experts_output.shape}'
 
633
  return output_backcast, output_forecast
634
 
635
 
636
+ def postprocess(
637
+ self,
638
+ backcast: torch.Tensor, # [batch_size, seq_len]
639
+ forecast: torch.Tensor, # [batch_size, pred_len]
640
+ output_backcast: torch.Tensor, # [batch_size, seq_len]
641
+ output_forecast: torch.Tensor, # [batch_size, pred_len]
642
+ ):
643
+ """
644
+ Args:
645
+ backcast (torch.Tensor): The previous layer's backcast time series (samples). [batch_size, seq_len]
646
+ forecast (torch.Tensor): The previous layer's forecast time series (samples). [batch_size, pred_len]
647
+ output_backcast (torch.Tensor): The current layer's output backcast time series (samples). [batch_size, seq_len]
648
+ output_forecast (torch.Tensor): The current layer's output forecast time series (samples). [batch_size, pred_len]
649
+ """
650
+ if output_backcast is not None:
651
+ # 25/8/14 @modified by xiaming: replaced RevIN with LayerNorm after the MoE layer
652
+ # Multiplying output_backcast by the input mask hurts performance, so we do not do it here
653
+ output_backcast = self.backcast_layernorm(output_backcast) # LayerNorm
654
+ if self.config.residual_backcast:
655
+ output_backcast = backcast - output_backcast
656
+
657
+ output_backcast[~self.input_mask] = self.config.mask_pad_value # Important! Restore masked time points to mask_pad_value (default: 255.)
658
+
659
+ if self.config.do_expert_forecast and forecast is not None: # The first layer's forecast is None
660
+ output_forecast = forecast + output_forecast
661
+
662
+ return output_backcast, output_forecast
663
+
664
+
665
+ def forward(self, backcast, forecast, rotary_pos_emb):
666
  inputs, probs, residual, routing_map = self.router_and_preprocess(backcast)
667
  experts_output, shared_experts_output = self.experts_compute(inputs, probs, residual, rotary_pos_emb, routing_map)
668
  output_backcast, output_forecast = self.combine(experts_output, shared_experts_output)
 
670
  return output_backcast, output_forecast
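
The postprocess above implements a doubly residual update: each layer's normalized backcast is subtracted from the incoming series, masked points are restored to the pad value, and forecasts are accumulated across layers. A minimal sketch of just that bookkeeping (shapes and values are made up):

import torch

seq_len, pred_len, mask_pad_value = 6, 3, 255.0
backcast = torch.randn(1, seq_len)
input_mask = torch.ones(1, seq_len, dtype=torch.bool)   # True = observed
forecast = torch.zeros(1, pred_len)

layer_backcast = torch.randn(1, seq_len)   # stands in for backcast_layernorm(output_backcast)
layer_forecast = torch.randn(1, pred_len)

backcast = backcast - layer_backcast        # residual_backcast=True branch
backcast[~input_mask] = mask_pad_value      # masked points go back to mask_pad_value
forecast = forecast + layer_forecast        # forecasts accumulate layer by layer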
671
 
672
 
 
673
  class FalconTSTBlock(nn.Module):
674
+ def __init__(self, config, input_layernorm = True):
675
  super().__init__()
676
  self.config = config
677
+
678
+ if input_layernorm:
679
+ self.input_layernorm = RMSNorm(self.config.seq_length)
680
+ else:
681
+ self.input_layernorm = IdentityOp()
682
+
683
  self.layers = nn.ModuleList([
684
+ FalconTSTMoELayer(config, layer_num + 1)
685
+ for layer_num in range(self.config.num_hidden_layers)
686
+ ])
687
+
688
+ def forward(self, x, rotary_pos_emb):
689
  backcast = x
690
  forecast = None
691
+
692
+ input_mask = (backcast != self.config.mask_pad_value)
693
+ backcast = self.input_layernorm(backcast * input_mask)
694
+ backcast[~input_mask] = self.config.mask_pad_value
695
+
696
  for layer in self.layers:
697
+ backcast, forecast = layer(backcast, forecast, rotary_pos_emb)
698
  return backcast,forecast
699
 
700
 
 
719
  if module.padding_idx is not None:
720
  module.weight.data[module.padding_idx].zero_()
721
 
722
+
723
  class FalconTSTModel(FalconTSTPreTrainedModel):
724
  def __init__(self, config: FalconTSTConfig):
725
  super().__init__(config)
726
  self.config = config
727
+ self.seq_length = self.config.seq_length
728
  self.rotary_pos_emb = RotaryEmbedding(
729
+ kv_channels=self.config.kv_channels,
730
+ rotary_base=self.config.rotary_base,
731
+ use_cpu_initialization=self.config.use_cpu_initialization,
732
+ rotary_interleaved=self.config.rotary_interleaved
733
  )
734
  self.decoder = FalconTSTBlock(
735
+ config=config,
736
+ input_layernorm=self.config.block_input_layernorm
737
+ )
738
  if self.config.do_expert_forecast and self.config.heterogeneous_moe_layer:
739
  self.output_layer = IdentityOp()
740
  else:
741
+ self.output_layer = nn.Linear(in_features=self.seq_length,
742
+ out_features=self.config.pred_length,
743
+ bias=self.config.add_bias_linear,)
744
 
745
 
746
  def revin(
 
769
  return input, means, stdev
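
The revin helper's body is elided in this hunk; reversible instance normalization of this kind typically standardizes each series with its own mean and standard deviation over observed points, so the sketch below is an assumption rather than the actual implementation:

import torch

def instance_normalize(x, mask, eps=1e-5):
    # x: [batch_size, seq_len]; mask: True where the point is observed
    masked = x * mask
    counts = mask.sum(dim=1, keepdim=True).clamp(min=1)
    means = masked.sum(dim=1, keepdim=True) / counts
    var = (((masked - means) * mask) ** 2).sum(dim=1, keepdim=True) / counts
    stdev = torch.sqrt(var + eps)
    return (x - means) / stdev, means, stdev   # same triple as the return statement above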
770
 
771
  def forward(self, input, revin):
772
+
 
 
 
 
773
  batch_size, input_len = input.shape
 
774
  # realize varied input length
775
  if input_len > self.seq_length:
776
  input = input[:, -self.seq_length:]
 
790
  rotary_pos_emb = self.rotary_pos_emb(input_len, device=input.device)
791
 
792
  # Step3. Do one-step inference to get mixed forecasts from multiple forecast heads
793
+ # mixed_pred: [batch_size, max(multi_forecast_head)]
794
  mixed_pred = self._inference_step(
795
  input=input,
796
  input_mask=input_mask,
 
823
  rotary_pos_emb,
824
  ):
825
  if self.config.do_base_forecast:
826
+ base_forecast, _ = self.base_output_layer(input * input_mask)
827
  else:
828
  base_forecast = None
829
 
830
  decoder_backcast, decoder_forecast = self.decoder(
831
+ input, # [batch_size, seq_len]
832
  rotary_pos_emb, # [input_len, 1, 1, kv_channels(hidden_size // num_heads)]
833
  )
834
 
 
837
  if self.config.heterogeneous_moe_layer:
838
  decoder_forecast = self.output_layer(decoder_forecast) # IdentityOp
839
  else:
840
+ final_forecast = self.output_layer(decoder_backcast * input_mask)
841
  decoder_forecast = decoder_forecast + final_forecast
842
  else:
843
  # The decoder_backcast contains the mask_pad_val(default:255.)
844
  decoder_forecast, _ = self.output_layer(decoder_backcast * input_mask)
845
+
846
  if self.config.do_base_forecast:
847
  assert base_forecast is not None, f'base_forecast is None'
848
  FalconTST_forecast = base_forecast + decoder_forecast
 
898
 
899
  final_output = final_output[:, :self.config.inference_length]
900
 
901
+ else:
902
+ raise NotImplementedError
 
 
 
 
903
 
904
  assert final_output.shape[1] == self.config.inference_length
905
  return final_output
906
 
907
+
908
+ class FalconTSTForPrediction(FalconTSTPreTrainedModel):
909
  def __init__(self, config: FalconTSTConfig):
910
  super().__init__(config)
911
  self.config = config
912
  self.model = FalconTSTModel(self.config)
913
  self.post_init()
914
 
915
+ @torch.no_grad()
916
+ def predict(
917
  self,
918
+ time_series: torch.Tensor,
919
+ forecast_horizon: int,
920
+ revin: bool = True,
921
+ ) -> torch.Tensor:
922
+ """
923
+ Generates time series forecasts autoregressively.
 
 
 
 
924
 
925
+ Args:
926
+ time_series (torch.Tensor): Input time series data.
927
+ Shape: [batch_size, seq_len] or [batch_size, seq_len, channels].
928
+ forecast_horizon (int): The number of future time steps to predict.
929
 
930
+ Returns:
931
+ torch.Tensor: The forecasted time series. Shape: [batch_size, forecast_horizon, channels].
 
 
 
 
 
 
 
 
 
932
  """
933
+ self.eval()
934
+
935
+ assert time_series.ndim == 2 or time_series.ndim == 3, "Input shape must be [batch, seq_len, channel] or [batch, seq_len]"
936
+ is_multichannel = time_series.ndim == 3
937
+ if is_multichannel:
938
+ batch_size, seq_len, num_channels = time_series.shape
939
+ # [B, L, C] -> [B * C, L]
940
+ input_flat = time_series.permute(0, 2, 1).reshape(batch_size * num_channels, seq_len)
941
+ else:
942
+ batch_size, seq_len = time_series.shape
943
+ num_channels = 1
944
+ input_flat = time_series
945
+
946
+ self.config.inference_length = forecast_horizon
947
+ forecast_flat = self.model(
948
+ input=input_flat,
949
+ revin=revin
950
+ ) # Shape: [B * C, H]
951
 
952
+ if is_multichannel:
953
+ forecast = forecast_flat.reshape(batch_size, num_channels, forecast_horizon)
954
+ forecast = forecast.permute(0, 2, 1).contiguous()
955
+ else:
956
+ forecast = forecast_flat
957
+
958
+ return forecast
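
A hypothetical usage sketch for the predict API above, assuming the checkpoint's auto_map exposes FalconTSTForPrediction through AutoModel; the model path is a placeholder and dtype/device handling is omitted:

import torch
from transformers import AutoModel

model = AutoModel.from_pretrained("path/to/FalconTST", trust_remote_code=True)

history = torch.randn(4, 2880, 3)  # [batch_size, seq_len, channels]
forecast = model.predict(history, forecast_horizon=96, revin=True)
print(forecast.shape)  # torch.Size([4, 96, 3]) per the docstring above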
959
 
 
ts_generation_mixin.py DELETED
@@ -1,89 +0,0 @@
1
- import warnings
2
- from typing import Any, Dict, List, Optional, Union, Callable
3
- import torch
4
- from transformers import GenerationMixin, LogitsProcessorList, StoppingCriteriaList
5
- from transformers.generation import validate_stopping_criteria, EosTokenCriteria
6
- from transformers.generation.utils import (
7
- GenerateNonBeamOutput,
8
- GenerateEncoderDecoderOutput,
9
- GenerateDecoderOnlyOutput,
10
- GenerationConfig,
11
- GenerateOutput,
12
- )
13
- from transformers.utils import ModelOutput
14
-
15
-
16
- class FalconTSTGenerationMixin(GenerationMixin):
17
- @torch.no_grad()
18
- def generate(
19
- self,
20
- inputs: Optional[torch.Tensor] = None,
21
- generation_config: Optional[GenerationConfig] = None,
22
- logits_processor: Optional[LogitsProcessorList] = None,
23
- stopping_criteria: Optional[StoppingCriteriaList] = None,
24
- prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None,
25
- synced_gpus: Optional[bool] = None,
26
- assistant_model: Optional["PreTrainedModel"] = None,
27
- streamer: Optional["BaseStreamer"] = None,
28
- negative_prompt_ids: Optional[torch.Tensor] = None,
29
- negative_prompt_attention_mask: Optional[torch.Tensor] = None,
30
- revin: Optional[bool] = True,
31
- num_samples: Optional[int] = 1,
32
- **kwargs,
33
- ) -> Union[GenerateOutput, torch.LongTensor]:
34
- """
35
- FalconTST generate function。
36
- """
37
- batch_size = inputs.shape[0]
38
- length = inputs.shape[1]
39
- channel = 1
40
- if len(inputs.shape) == 3:
41
- channel = inputs.shape[2]
42
- inputs = inputs.permute(0, 2, 1).reshape(batch_size * channel, length)
43
- elif len(inputs.shape) > 3:
44
- raise ValueError("Input shape must be [batch, seq_len, channel] or [batch, seq_len]")
45
-
46
- outputs = super().generate(
47
- inputs=inputs,
48
- generation_config=generation_config,
49
- logits_processor=logits_processor,
50
- stopping_criteria=stopping_criteria,
51
- prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
52
- synced_gpus=synced_gpus,
53
- assistant_model=assistant_model,
54
- streamer=streamer,
55
- negative_prompt_ids=negative_prompt_ids,
56
- negative_prompt_attention_mask=negative_prompt_attention_mask,
57
- revin=revin,
58
- **kwargs,
59
- )
60
- pred_len = outputs.shape[1]
61
- outputs = outputs.reshape(batch_size, channel, pred_len)
62
- outputs = outputs.transpose(1, 2).contiguous()
63
- return outputs
64
-
65
- def _greedy_search(
66
- self,
67
- input_ids: torch.Tensor,
68
- logits_processor: Optional[LogitsProcessorList] = None,
69
- stopping_criteria: Optional[StoppingCriteriaList] = None,
70
- max_length: Optional[int] = None,
71
- pad_token_id: Optional[int] = None,
72
- eos_token_id: Optional[Union[int, List[int]]] = None,
73
- output_attentions: Optional[bool] = None,
74
- output_hidden_states: Optional[bool] = None,
75
- output_scores: Optional[bool] = None,
76
- output_logits: Optional[bool] = None,
77
- return_dict_in_generate: Optional[bool] = None,
78
- synced_gpus: bool = False,
79
- streamer: Optional["BaseStreamer"] = None,
80
- **model_kwargs,
81
- ) -> Union[GenerateNonBeamOutput, torch.Tensor]:
82
- input_ids = input_ids.to(self.device)
83
- batch_size, cur_len = input_ids.shape
84
- logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
85
- stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
86
- model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
87
- # stopping_criteria.max_length = input_len + pred_len
88
- outputs = self(**model_inputs, return_dict=True, max_output_length=stopping_criteria.max_length-cur_len)
89
- return outputs