rulixiang committed
Commit b1e036f · 1 Parent(s): e4effe5

First model version

config.json ADDED
@@ -0,0 +1,58 @@
+ {
+   "_name_or_path": "patchmoe",
+   "architectures": [
+     "PatchMoEForPrediction"
+   ],
+   "auto_map": {
+     "AutoConfig": "configuration_patch_moe.PatchMoeConfig",
+     "AutoModelForCausalLM": "modeling_patch_moe.PatchMoEForPrediction"
+   },
+   "disable_bias_linear": false,
+   "do_base_forecast": false,
+   "do_expert_forecast": true,
+   "expert_num_layers": 4,
+   "ffn_hidden_size": 4096,
+   "heterogeneous_moe_layer": false,
+   "hidden_size": 1024,
+   "init_method_std": 0.06,
+   "is_revin": true,
+   "k_layernorm": false,
+   "kv_channels": 64,
+   "mask_pad_value": 255.0,
+   "model_type": "patch_moe",
+   "moe_expert_final_layernorm": true,
+   "moe_ffn_hidden_size": 4096,
+   "moe_router_enable_expert_bias": false,
+   "moe_router_input_size": 2880,
+   "moe_router_pre_softmax": true,
+   "moe_router_score_function": "softmax",
+   "moe_router_topk": 1,
+   "moe_shared_expert_intermediate_size": 4096,
+   "multi_forecast_head_list": [
+     24,
+     96,
+     336
+   ],
+   "num_attention_heads": 16,
+   "num_hidden_layers": 2,
+   "num_moe_experts": 4,
+   "torch_dtype": "bfloat16",
+   "patch_size_list": [
+     120,
+     96,
+     64,
+     36
+   ],
+   "pred_length": 336,
+   "q_layernorm": false,
+   "residual_backcast": true,
+   "rotary_base": 1000000,
+   "rotary_interleaved": false,
+   "seq_length": 2880,
+   "shared_patch_size": 32,
+   "tie_word_embeddings": false,
+   "transformer_input_layernorm": true,
+   "transformers_version": "4.40.1",
+   "use_cache": true,
+   "use_cpu_initialization": true
+ }
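The `auto_map` entries above register the custom `PatchMoeConfig` and `PatchMoEForPrediction` classes, so the checkpoint is intended to be loaded with `trust_remote_code=True`. A minimal loading sketch follows; the repo id is a placeholder, and the exact forecasting/`generate` call signature lives in `modeling_patch_moe.py`, which is only partially shown in this excerpt:

```python
# Minimal loading sketch; "rulixiang/patchmoe" is a placeholder repository id.
import torch
from transformers import AutoConfig, AutoModelForCausalLM

repo_id = "rulixiang/patchmoe"  # substitute the actual Hub repository id
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    trust_remote_code=True,       # classes are resolved through the auto_map above
    torch_dtype=torch.bfloat16,   # matches "torch_dtype" in config.json
)
print(config.seq_length, config.pred_length)  # 2880, 336 for this checkpoint
```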
configuration_patch_moe.py ADDED
@@ -0,0 +1,203 @@
+ """
+ Configuration class for PatchMoE model.
+
+ This module defines the configuration for PatchMoE, a large-scale time series foundation model
+ that utilizes Mixture of Experts (MoE) architecture with multiple patch tokenizers.
+ """
+
+ from typing import List, Optional
+ from transformers import PretrainedConfig
+
+
+ class PatchMoeConfig(PretrainedConfig):
+     """
+     Configuration class for PatchMoE model.
+
+     PatchMoE is a time series foundation model that uses Mixture of Experts architecture
+     with multiple patch tokenizers for efficient time series forecasting.
+
+     This configuration inherits from [`PretrainedConfig`] and can be used to control the model
+     output. Read the documentation from [`PretrainedConfig`] for more information.
+
+     Args:
+         hidden_size (`int`, *optional*, defaults to 1024):
+             Dimensionality of the encoder layers and the pooler layer.
+         ffn_hidden_size (`int`, *optional*, defaults to 4096):
+             Dimensionality of the feed-forward networks in the transformer layers.
+         seq_length (`int`, *optional*, defaults to 2880):
+             Maximum sequence length that the model can handle.
+         add_bias_linear (`bool`, *optional*, defaults to `False`):
+             Whether to add bias in linear layers.
+         rope_theta (`int`, *optional*, defaults to 10000):
+             The base period of the RoPE embeddings.
+         num_hidden_layers (`int`, *optional*, defaults to 3):
+             Number of hidden layers in the transformer encoder.
+         num_attention_heads (`int`, *optional*, defaults to 16):
+             Number of attention heads for each attention layer in the transformer encoder.
+         mask_pad_value (`float`, *optional*, defaults to 255.0):
+             Value used for padding/masking in input sequences.
+         expert_num_layers (`int`, *optional*, defaults to 4):
+             Number of transformer layers within each expert.
+         shared_patch_size (`int`, *optional*, defaults to 64):
+             Size of patches for the shared expert.
+         patch_size_list (`List[int]`, *optional*, defaults to [96, 64, 48, 24]):
+             List of patch sizes for different experts.
+         multi_forecast_head_list (`List[int]`, *optional*, defaults to [24, 96, 336]):
+             List of forecast lengths for multi-head prediction.
+         is_revin (`bool`, *optional*, defaults to `True`):
+             Whether to use RevIN (Reversible Instance Normalization).
+         params_dtype (`str`, *optional*, defaults to "bfloat16"):
+             Data type for model parameters.
+         use_cpu_initialization (`bool`, *optional*, defaults to `False`):
+             Whether to initialize model parameters on CPU.
+         rotary_interleaved (`bool`, *optional*, defaults to `False`):
+             Whether to use interleaved rotary position embeddings.
+         do_expert_forecast (`bool`, *optional*, defaults to `True`):
+             Whether experts perform forecasting.
+         residual_backcast (`bool`, *optional*, defaults to `True`):
+             Whether to use residual connections for backcast.
+         do_base_forecast (`bool`, *optional*, defaults to `False`):
+             Whether to use base forecasting.
+         heterogeneous_moe_layer (`bool`, *optional*, defaults to `True`):
+             Whether to use heterogeneous MoE layers.
+         test_data_seq_len (`int`, *optional*, defaults to 2880):
+             Sequence length for test data.
+         test_data_test_len (`int`, *optional*, defaults to 720):
+             Test length for test data.
+         autoregressive_step_list (`List[int]`, *optional*, defaults to [2, 4, 1]):
+             List of autoregressive steps for different forecast heads.
+         multi_forecast_head_type (`str`, *optional*, defaults to "single"):
+             Type of multi-forecast head aggregation.
+         num_experts (`int`, *optional*, defaults to 4):
+             Number of experts in the MoE layer.
+         moe_router_topk (`int`, *optional*, defaults to 2):
+             Number of top experts to route each token to.
+         moe_ffn_hidden_size (`int`, *optional*, defaults to 4096):
+             Hidden size for MoE feed-forward networks.
+         moe_shared_expert_intermediate_size (`int`, *optional*, defaults to 4096):
+             Intermediate size for shared experts.
+         init_method_std (`float`, *optional*, defaults to 0.06):
+             Standard deviation for weight initialization.
+         initializer_range (`float`, *optional*, defaults to 0.02):
+             Range for weight initialization.
+         moe_router_enable_expert_bias (`bool`, *optional*, defaults to `False`):
+             Whether to enable expert bias in routing.
+         moe_expert_final_layernorm (`bool`, *optional*, defaults to `True`):
+             Whether to apply layer normalization at the end of each expert.
+         transformer_input_layernorm (`bool`, *optional*, defaults to `True`):
+             Whether to apply layer normalization to transformer inputs.
+         moe_router_pre_softmax (`bool`, *optional*, defaults to `True`):
+             Whether to apply softmax before routing.
+         q_layernorm (`bool`, *optional*, defaults to `False`):
+             Whether to apply layer normalization to query vectors.
+         k_layernorm (`bool`, *optional*, defaults to `False`):
+             Whether to apply layer normalization to key vectors.
+         moe_router_score_function (`str`, *optional*, defaults to "softmax"):
+             Score function for MoE routing ("softmax" or "sigmoid").
+         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+             Whether to tie word embeddings.
+     """
+
+     model_type = "patch_moe"
+     keys_to_ignore_at_inference = ["past_key_values"]
+
+     def __init__(
+         self,
+         hidden_size: int = 1024,
+         ffn_hidden_size: int = 4096,
+         seq_length: int = 2880,
+         add_bias_linear: bool = False,
+         rope_theta: int = 10000,
+         num_hidden_layers: int = 3,
+         num_attention_heads: int = 16,
+         mask_pad_value: float = 255.0,
+         expert_num_layers: int = 4,
+         shared_patch_size: int = 64,
+         patch_size_list: Optional[List[int]] = None,
+         multi_forecast_head_list: Optional[List[int]] = None,
+         is_revin: bool = True,
+         use_cpu_initialization: bool = False,
+         rotary_interleaved: bool = False,
+         do_expert_forecast: bool = True,
+         residual_backcast: bool = True,
+         do_base_forecast: bool = False,
+         heterogeneous_moe_layer: bool = True,
+         test_data_seq_len: int = 2880,
+         test_data_test_len: int = 720,
+         autoregressive_step_list: Optional[List[int]] = None,
+         multi_forecast_head_type: str = "single",
+         num_experts: int = 4,
+         moe_router_topk: int = 2,
+         moe_ffn_hidden_size: int = 4096,
+         moe_shared_expert_intermediate_size: int = 4096,
+         init_method_std: float = 0.06,
+         initializer_range: float = 0.02,
+         moe_router_enable_expert_bias: bool = False,
+         moe_expert_final_layernorm: bool = True,
+         transformer_input_layernorm: bool = True,
+         moe_router_pre_softmax: bool = True,
+         q_layernorm: bool = False,
+         k_layernorm: bool = False,
+         moe_router_score_function: str = "softmax",
+         tie_word_embeddings: bool = False,
+         **kwargs,
+     ):
+         """Initialize PatchMoE configuration."""
+         # Set default values for list parameters
+         if patch_size_list is None:
+             patch_size_list = [96, 64, 48, 24]
+         if multi_forecast_head_list is None:
+             multi_forecast_head_list = [24, 96, 336]
+         if autoregressive_step_list is None:
+             autoregressive_step_list = [2, 4, 1]
+         # patchmoe inference specific
+         self.test_data_seq_len = test_data_seq_len
+         self.inference_length = test_data_test_len
+         self.autoregressive_step_list = autoregressive_step_list
+         self.multi_forecast_head_type = multi_forecast_head_type
+         self.use_cache = True
+
+         # patchmoe specific
+         self.hidden_size = hidden_size
+         self.ffn_hidden_size = ffn_hidden_size
+         self.num_attention_heads = num_attention_heads
+         self.init_method_std = init_method_std
+         self.initializer_range = initializer_range
+         self.seq_length = seq_length
+         self.multi_forecast_head_list = multi_forecast_head_list
+         self.kv_channels = self.hidden_size // self.num_attention_heads
+         self.rotary_base = rope_theta
+         self.num_hidden_layers = num_hidden_layers
+         self.mask_pad_value = mask_pad_value
+         self.pred_length = max(self.multi_forecast_head_list)
+         self.add_bias_linear = add_bias_linear
+         self.is_revin = is_revin
+         self.do_base_forecast = do_base_forecast
+         self.do_expert_forecast = do_expert_forecast
+         self.residual_backcast = residual_backcast
+         self.heterogeneous_moe_layer = heterogeneous_moe_layer
+         self.use_cpu_initialization = use_cpu_initialization
+         self.rotary_interleaved = rotary_interleaved
+
+         # expert specific
+         self.patch_size_list = patch_size_list
+         self.num_moe_experts = num_experts
+         self.shared_patch_size = shared_patch_size
+         self.expert_num_layers = expert_num_layers
+         self.moe_router_input_size = self.seq_length
+         self.moe_router_topk = moe_router_topk
+         self.moe_router_score_function = moe_router_score_function
+         self.moe_ffn_hidden_size = moe_ffn_hidden_size
+         self.moe_shared_expert_intermediate_size = moe_shared_expert_intermediate_size
+         self.moe_router_enable_expert_bias = moe_router_enable_expert_bias
+         self.moe_expert_final_layernorm = moe_expert_final_layernorm
+         self.transformer_input_layernorm = transformer_input_layernorm
+         self.moe_router_pre_softmax = moe_router_pre_softmax
+         self.q_layernorm = q_layernorm
+         self.k_layernorm = k_layernorm
+
+         kwargs.pop("tie_word_embeddings", None)
+         super().__init__(
+             tie_word_embeddings=tie_word_embeddings,
+             **kwargs,
+         )
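For orientation, a small sketch of how the configuration behaves when instantiated directly; the values below follow the defaults and derivations visible in `__init__`, and the import path assumes the file is available locally (on the Hub it is resolved through the `auto_map`):

```python
# Sketch: PatchMoeConfig derives several fields from its arguments in __init__.
from configuration_patch_moe import PatchMoeConfig  # assumed local import path

cfg = PatchMoeConfig(hidden_size=1024, num_attention_heads=16,
                     multi_forecast_head_list=[24, 96, 336])
assert cfg.kv_channels == 1024 // 16             # hidden_size // num_attention_heads
assert cfg.pred_length == 336                    # max(multi_forecast_head_list)
assert cfg.moe_router_input_size == cfg.seq_length
assert cfg.num_moe_experts == 4                  # stored as num_moe_experts, not num_experts
```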
generation_config.json ADDED
@@ -0,0 +1,4 @@
+ {
+   "_from_model_config": true,
+   "transformers_version": "4.40.1"
+ }
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8534fa131034e86c50ec43cc14e3d6f17af1d5d4161a11ada2218d12067e1c4c
+ size 3718382544
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e5a15d1fcb6388aed06deb70f77918cd38899476dc0c4b1ac7dc57391cf8a477
+ size 1264771376
model.safetensors.index.json ADDED
@@ -0,0 +1,292 @@
1
+ {
2
+ "metadata": {
3
+ "total_size": 4983109888
4
+ },
5
+ "weight_map": {
6
+ "model.decoder.layers.0.router.weight": "model-00001-of-00002.safetensors",
7
+ "model.decoder.layers.0.backcast_layernorm.weight": "model-00001-of-00002.safetensors",
8
+ "model.decoder.layers.0.experts.local_experts.0.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
9
+ "model.decoder.layers.0.experts.local_experts.0.layers.0.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
10
+ "model.decoder.layers.0.experts.local_experts.0.layers.0.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
11
+ "model.decoder.layers.0.experts.local_experts.0.layers.0.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
12
+ "model.decoder.layers.0.experts.local_experts.0.layers.0.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
13
+ "model.decoder.layers.0.experts.local_experts.0.layers.0.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
14
+ "model.decoder.layers.0.experts.local_experts.0.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
15
+ "model.decoder.layers.0.experts.local_experts.0.layers.1.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
16
+ "model.decoder.layers.0.experts.local_experts.0.layers.1.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
17
+ "model.decoder.layers.0.experts.local_experts.0.layers.1.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
18
+ "model.decoder.layers.0.experts.local_experts.0.layers.1.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
19
+ "model.decoder.layers.0.experts.local_experts.0.layers.1.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
20
+ "model.decoder.layers.0.experts.local_experts.0.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
21
+ "model.decoder.layers.0.experts.local_experts.0.layers.2.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
22
+ "model.decoder.layers.0.experts.local_experts.0.layers.2.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
23
+ "model.decoder.layers.0.experts.local_experts.0.layers.2.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
24
+ "model.decoder.layers.0.experts.local_experts.0.layers.2.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
25
+ "model.decoder.layers.0.experts.local_experts.0.layers.2.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
26
+ "model.decoder.layers.0.experts.local_experts.0.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
27
+ "model.decoder.layers.0.experts.local_experts.0.layers.3.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
28
+ "model.decoder.layers.0.experts.local_experts.0.layers.3.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
29
+ "model.decoder.layers.0.experts.local_experts.0.layers.3.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
30
+ "model.decoder.layers.0.experts.local_experts.0.layers.3.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
31
+ "model.decoder.layers.0.experts.local_experts.0.layers.3.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
32
+ "model.decoder.layers.0.experts.local_experts.0.final_layernorm.weight": "model-00001-of-00002.safetensors",
33
+ "model.decoder.layers.0.experts.local_experts.0.patch_embedding.linear_fc1.weight": "model-00001-of-00002.safetensors",
34
+ "model.decoder.layers.0.experts.local_experts.0.patch_embedding.linear_fc2.weight": "model-00001-of-00002.safetensors",
35
+ "model.decoder.layers.0.experts.local_experts.0.output_layer.weight": "model-00001-of-00002.safetensors",
36
+ "model.decoder.layers.0.experts.local_experts.1.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
37
+ "model.decoder.layers.0.experts.local_experts.1.layers.0.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
38
+ "model.decoder.layers.0.experts.local_experts.1.layers.0.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
39
+ "model.decoder.layers.0.experts.local_experts.1.layers.0.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
40
+ "model.decoder.layers.0.experts.local_experts.1.layers.0.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
41
+ "model.decoder.layers.0.experts.local_experts.1.layers.0.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
42
+ "model.decoder.layers.0.experts.local_experts.1.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
43
+ "model.decoder.layers.0.experts.local_experts.1.layers.1.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
44
+ "model.decoder.layers.0.experts.local_experts.1.layers.1.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
45
+ "model.decoder.layers.0.experts.local_experts.1.layers.1.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
46
+ "model.decoder.layers.0.experts.local_experts.1.layers.1.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
47
+ "model.decoder.layers.0.experts.local_experts.1.layers.1.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
48
+ "model.decoder.layers.0.experts.local_experts.1.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
49
+ "model.decoder.layers.0.experts.local_experts.1.layers.2.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
50
+ "model.decoder.layers.0.experts.local_experts.1.layers.2.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
51
+ "model.decoder.layers.0.experts.local_experts.1.layers.2.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
52
+ "model.decoder.layers.0.experts.local_experts.1.layers.2.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
53
+ "model.decoder.layers.0.experts.local_experts.1.layers.2.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
54
+ "model.decoder.layers.0.experts.local_experts.1.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
55
+ "model.decoder.layers.0.experts.local_experts.1.layers.3.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
56
+ "model.decoder.layers.0.experts.local_experts.1.layers.3.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
57
+ "model.decoder.layers.0.experts.local_experts.1.layers.3.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
58
+ "model.decoder.layers.0.experts.local_experts.1.layers.3.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
59
+ "model.decoder.layers.0.experts.local_experts.1.layers.3.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
60
+ "model.decoder.layers.0.experts.local_experts.1.final_layernorm.weight": "model-00001-of-00002.safetensors",
61
+ "model.decoder.layers.0.experts.local_experts.1.patch_embedding.linear_fc1.weight": "model-00001-of-00002.safetensors",
62
+ "model.decoder.layers.0.experts.local_experts.1.patch_embedding.linear_fc2.weight": "model-00001-of-00002.safetensors",
63
+ "model.decoder.layers.0.experts.local_experts.1.output_layer.weight": "model-00001-of-00002.safetensors",
64
+ "model.decoder.layers.0.experts.local_experts.2.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
65
+ "model.decoder.layers.0.experts.local_experts.2.layers.0.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
66
+ "model.decoder.layers.0.experts.local_experts.2.layers.0.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
67
+ "model.decoder.layers.0.experts.local_experts.2.layers.0.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
68
+ "model.decoder.layers.0.experts.local_experts.2.layers.0.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
69
+ "model.decoder.layers.0.experts.local_experts.2.layers.0.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
70
+ "model.decoder.layers.0.experts.local_experts.2.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
71
+ "model.decoder.layers.0.experts.local_experts.2.layers.1.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
72
+ "model.decoder.layers.0.experts.local_experts.2.layers.1.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
73
+ "model.decoder.layers.0.experts.local_experts.2.layers.1.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
74
+ "model.decoder.layers.0.experts.local_experts.2.layers.1.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
75
+ "model.decoder.layers.0.experts.local_experts.2.layers.1.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
76
+ "model.decoder.layers.0.experts.local_experts.2.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
77
+ "model.decoder.layers.0.experts.local_experts.2.layers.2.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
78
+ "model.decoder.layers.0.experts.local_experts.2.layers.2.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
79
+ "model.decoder.layers.0.experts.local_experts.2.layers.2.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
80
+ "model.decoder.layers.0.experts.local_experts.2.layers.2.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
81
+ "model.decoder.layers.0.experts.local_experts.2.layers.2.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
82
+ "model.decoder.layers.0.experts.local_experts.2.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
83
+ "model.decoder.layers.0.experts.local_experts.2.layers.3.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
84
+ "model.decoder.layers.0.experts.local_experts.2.layers.3.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
85
+ "model.decoder.layers.0.experts.local_experts.2.layers.3.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
86
+ "model.decoder.layers.0.experts.local_experts.2.layers.3.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
87
+ "model.decoder.layers.0.experts.local_experts.2.layers.3.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
88
+ "model.decoder.layers.0.experts.local_experts.2.final_layernorm.weight": "model-00001-of-00002.safetensors",
89
+ "model.decoder.layers.0.experts.local_experts.2.patch_embedding.linear_fc1.weight": "model-00001-of-00002.safetensors",
90
+ "model.decoder.layers.0.experts.local_experts.2.patch_embedding.linear_fc2.weight": "model-00001-of-00002.safetensors",
91
+ "model.decoder.layers.0.experts.local_experts.2.output_layer.weight": "model-00001-of-00002.safetensors",
92
+ "model.decoder.layers.0.experts.local_experts.3.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
93
+ "model.decoder.layers.0.experts.local_experts.3.layers.0.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
94
+ "model.decoder.layers.0.experts.local_experts.3.layers.0.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
95
+ "model.decoder.layers.0.experts.local_experts.3.layers.0.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
96
+ "model.decoder.layers.0.experts.local_experts.3.layers.0.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
97
+ "model.decoder.layers.0.experts.local_experts.3.layers.0.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
98
+ "model.decoder.layers.0.experts.local_experts.3.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
99
+ "model.decoder.layers.0.experts.local_experts.3.layers.1.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
100
+ "model.decoder.layers.0.experts.local_experts.3.layers.1.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
101
+ "model.decoder.layers.0.experts.local_experts.3.layers.1.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
102
+ "model.decoder.layers.0.experts.local_experts.3.layers.1.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
103
+ "model.decoder.layers.0.experts.local_experts.3.layers.1.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
104
+ "model.decoder.layers.0.experts.local_experts.3.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
105
+ "model.decoder.layers.0.experts.local_experts.3.layers.2.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
106
+ "model.decoder.layers.0.experts.local_experts.3.layers.2.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
107
+ "model.decoder.layers.0.experts.local_experts.3.layers.2.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
108
+ "model.decoder.layers.0.experts.local_experts.3.layers.2.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
109
+ "model.decoder.layers.0.experts.local_experts.3.layers.2.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
110
+ "model.decoder.layers.0.experts.local_experts.3.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
111
+ "model.decoder.layers.0.experts.local_experts.3.layers.3.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
112
+ "model.decoder.layers.0.experts.local_experts.3.layers.3.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
113
+ "model.decoder.layers.0.experts.local_experts.3.layers.3.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
114
+ "model.decoder.layers.0.experts.local_experts.3.layers.3.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
115
+ "model.decoder.layers.0.experts.local_experts.3.layers.3.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
116
+ "model.decoder.layers.0.experts.local_experts.3.final_layernorm.weight": "model-00001-of-00002.safetensors",
117
+ "model.decoder.layers.0.experts.local_experts.3.patch_embedding.linear_fc1.weight": "model-00001-of-00002.safetensors",
118
+ "model.decoder.layers.0.experts.local_experts.3.patch_embedding.linear_fc2.weight": "model-00001-of-00002.safetensors",
119
+ "model.decoder.layers.0.experts.local_experts.3.output_layer.weight": "model-00001-of-00002.safetensors",
120
+ "model.decoder.layers.0.shared_experts.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
121
+ "model.decoder.layers.0.shared_experts.layers.0.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
122
+ "model.decoder.layers.0.shared_experts.layers.0.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
123
+ "model.decoder.layers.0.shared_experts.layers.0.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
124
+ "model.decoder.layers.0.shared_experts.layers.0.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
125
+ "model.decoder.layers.0.shared_experts.layers.0.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
126
+ "model.decoder.layers.0.shared_experts.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
127
+ "model.decoder.layers.0.shared_experts.layers.1.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
128
+ "model.decoder.layers.0.shared_experts.layers.1.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
129
+ "model.decoder.layers.0.shared_experts.layers.1.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
130
+ "model.decoder.layers.0.shared_experts.layers.1.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
131
+ "model.decoder.layers.0.shared_experts.layers.1.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
132
+ "model.decoder.layers.0.shared_experts.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
133
+ "model.decoder.layers.0.shared_experts.layers.2.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
134
+ "model.decoder.layers.0.shared_experts.layers.2.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
135
+ "model.decoder.layers.0.shared_experts.layers.2.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
136
+ "model.decoder.layers.0.shared_experts.layers.2.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
137
+ "model.decoder.layers.0.shared_experts.layers.2.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
138
+ "model.decoder.layers.0.shared_experts.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
139
+ "model.decoder.layers.0.shared_experts.layers.3.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
140
+ "model.decoder.layers.0.shared_experts.layers.3.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
141
+ "model.decoder.layers.0.shared_experts.layers.3.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
142
+ "model.decoder.layers.0.shared_experts.layers.3.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
143
+ "model.decoder.layers.0.shared_experts.layers.3.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
144
+ "model.decoder.layers.0.shared_experts.final_layernorm.weight": "model-00001-of-00002.safetensors",
145
+ "model.decoder.layers.0.shared_experts.patch_embedding.linear_fc1.weight": "model-00001-of-00002.safetensors",
146
+ "model.decoder.layers.0.shared_experts.patch_embedding.linear_fc2.weight": "model-00001-of-00002.safetensors",
147
+ "model.decoder.layers.0.shared_experts.output_layer.weight": "model-00001-of-00002.safetensors",
148
+ "model.decoder.layers.1.router.weight": "model-00001-of-00002.safetensors",
149
+ "model.decoder.layers.1.backcast_layernorm.weight": "model-00001-of-00002.safetensors",
150
+ "model.decoder.layers.1.experts.local_experts.0.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
151
+ "model.decoder.layers.1.experts.local_experts.0.layers.0.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
152
+ "model.decoder.layers.1.experts.local_experts.0.layers.0.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
153
+ "model.decoder.layers.1.experts.local_experts.0.layers.0.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
154
+ "model.decoder.layers.1.experts.local_experts.0.layers.0.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
155
+ "model.decoder.layers.1.experts.local_experts.0.layers.0.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
156
+ "model.decoder.layers.1.experts.local_experts.0.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
157
+ "model.decoder.layers.1.experts.local_experts.0.layers.1.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
158
+ "model.decoder.layers.1.experts.local_experts.0.layers.1.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
159
+ "model.decoder.layers.1.experts.local_experts.0.layers.1.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
160
+ "model.decoder.layers.1.experts.local_experts.0.layers.1.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
161
+ "model.decoder.layers.1.experts.local_experts.0.layers.1.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
162
+ "model.decoder.layers.1.experts.local_experts.0.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
163
+ "model.decoder.layers.1.experts.local_experts.0.layers.2.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
164
+ "model.decoder.layers.1.experts.local_experts.0.layers.2.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
165
+ "model.decoder.layers.1.experts.local_experts.0.layers.2.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
166
+ "model.decoder.layers.1.experts.local_experts.0.layers.2.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
167
+ "model.decoder.layers.1.experts.local_experts.0.layers.2.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
168
+ "model.decoder.layers.1.experts.local_experts.0.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
169
+ "model.decoder.layers.1.experts.local_experts.0.layers.3.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
170
+ "model.decoder.layers.1.experts.local_experts.0.layers.3.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
171
+ "model.decoder.layers.1.experts.local_experts.0.layers.3.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
172
+ "model.decoder.layers.1.experts.local_experts.0.layers.3.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
173
+ "model.decoder.layers.1.experts.local_experts.0.layers.3.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
174
+ "model.decoder.layers.1.experts.local_experts.0.final_layernorm.weight": "model-00001-of-00002.safetensors",
175
+ "model.decoder.layers.1.experts.local_experts.0.patch_embedding.linear_fc1.weight": "model-00001-of-00002.safetensors",
176
+ "model.decoder.layers.1.experts.local_experts.0.patch_embedding.linear_fc2.weight": "model-00001-of-00002.safetensors",
177
+ "model.decoder.layers.1.experts.local_experts.0.output_layer.weight": "model-00001-of-00002.safetensors",
178
+ "model.decoder.layers.1.experts.local_experts.1.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
179
+ "model.decoder.layers.1.experts.local_experts.1.layers.0.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
180
+ "model.decoder.layers.1.experts.local_experts.1.layers.0.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
181
+ "model.decoder.layers.1.experts.local_experts.1.layers.0.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
182
+ "model.decoder.layers.1.experts.local_experts.1.layers.0.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
183
+ "model.decoder.layers.1.experts.local_experts.1.layers.0.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
184
+ "model.decoder.layers.1.experts.local_experts.1.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
185
+ "model.decoder.layers.1.experts.local_experts.1.layers.1.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
186
+ "model.decoder.layers.1.experts.local_experts.1.layers.1.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
187
+ "model.decoder.layers.1.experts.local_experts.1.layers.1.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
188
+ "model.decoder.layers.1.experts.local_experts.1.layers.1.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
189
+ "model.decoder.layers.1.experts.local_experts.1.layers.1.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
190
+ "model.decoder.layers.1.experts.local_experts.1.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
191
+ "model.decoder.layers.1.experts.local_experts.1.layers.2.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
192
+ "model.decoder.layers.1.experts.local_experts.1.layers.2.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
193
+ "model.decoder.layers.1.experts.local_experts.1.layers.2.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
194
+ "model.decoder.layers.1.experts.local_experts.1.layers.2.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
195
+ "model.decoder.layers.1.experts.local_experts.1.layers.2.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
196
+ "model.decoder.layers.1.experts.local_experts.1.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
197
+ "model.decoder.layers.1.experts.local_experts.1.layers.3.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
198
+ "model.decoder.layers.1.experts.local_experts.1.layers.3.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
199
+ "model.decoder.layers.1.experts.local_experts.1.layers.3.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
200
+ "model.decoder.layers.1.experts.local_experts.1.layers.3.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
201
+ "model.decoder.layers.1.experts.local_experts.1.layers.3.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
202
+ "model.decoder.layers.1.experts.local_experts.1.final_layernorm.weight": "model-00001-of-00002.safetensors",
203
+ "model.decoder.layers.1.experts.local_experts.1.patch_embedding.linear_fc1.weight": "model-00001-of-00002.safetensors",
204
+ "model.decoder.layers.1.experts.local_experts.1.patch_embedding.linear_fc2.weight": "model-00001-of-00002.safetensors",
205
+ "model.decoder.layers.1.experts.local_experts.1.output_layer.weight": "model-00001-of-00002.safetensors",
206
+ "model.decoder.layers.1.experts.local_experts.2.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
207
+ "model.decoder.layers.1.experts.local_experts.2.layers.0.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
208
+ "model.decoder.layers.1.experts.local_experts.2.layers.0.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
209
+ "model.decoder.layers.1.experts.local_experts.2.layers.0.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
210
+ "model.decoder.layers.1.experts.local_experts.2.layers.0.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
211
+ "model.decoder.layers.1.experts.local_experts.2.layers.0.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
212
+ "model.decoder.layers.1.experts.local_experts.2.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
213
+ "model.decoder.layers.1.experts.local_experts.2.layers.1.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
214
+ "model.decoder.layers.1.experts.local_experts.2.layers.1.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
215
+ "model.decoder.layers.1.experts.local_experts.2.layers.1.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
216
+ "model.decoder.layers.1.experts.local_experts.2.layers.1.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
217
+ "model.decoder.layers.1.experts.local_experts.2.layers.1.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
218
+ "model.decoder.layers.1.experts.local_experts.2.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
219
+ "model.decoder.layers.1.experts.local_experts.2.layers.2.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
220
+ "model.decoder.layers.1.experts.local_experts.2.layers.2.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
221
+ "model.decoder.layers.1.experts.local_experts.2.layers.2.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
222
+ "model.decoder.layers.1.experts.local_experts.2.layers.2.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
223
+ "model.decoder.layers.1.experts.local_experts.2.layers.2.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
224
+ "model.decoder.layers.1.experts.local_experts.2.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
225
+ "model.decoder.layers.1.experts.local_experts.2.layers.3.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
226
+ "model.decoder.layers.1.experts.local_experts.2.layers.3.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
227
+ "model.decoder.layers.1.experts.local_experts.2.layers.3.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
228
+ "model.decoder.layers.1.experts.local_experts.2.layers.3.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
229
+ "model.decoder.layers.1.experts.local_experts.2.layers.3.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
230
+ "model.decoder.layers.1.experts.local_experts.2.final_layernorm.weight": "model-00001-of-00002.safetensors",
231
+ "model.decoder.layers.1.experts.local_experts.2.patch_embedding.linear_fc1.weight": "model-00001-of-00002.safetensors",
232
+ "model.decoder.layers.1.experts.local_experts.2.patch_embedding.linear_fc2.weight": "model-00001-of-00002.safetensors",
233
+ "model.decoder.layers.1.experts.local_experts.2.output_layer.weight": "model-00001-of-00002.safetensors",
234
+ "model.decoder.layers.1.experts.local_experts.3.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
235
+ "model.decoder.layers.1.experts.local_experts.3.layers.0.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
236
+ "model.decoder.layers.1.experts.local_experts.3.layers.0.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
237
+ "model.decoder.layers.1.experts.local_experts.3.layers.0.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
238
+ "model.decoder.layers.1.experts.local_experts.3.layers.0.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
239
+ "model.decoder.layers.1.experts.local_experts.3.layers.0.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
240
+ "model.decoder.layers.1.experts.local_experts.3.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
241
+ "model.decoder.layers.1.experts.local_experts.3.layers.1.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
242
+ "model.decoder.layers.1.experts.local_experts.3.layers.1.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
243
+ "model.decoder.layers.1.experts.local_experts.3.layers.1.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
244
+ "model.decoder.layers.1.experts.local_experts.3.layers.1.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
245
+ "model.decoder.layers.1.experts.local_experts.3.layers.1.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
246
+ "model.decoder.layers.1.experts.local_experts.3.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
247
+ "model.decoder.layers.1.experts.local_experts.3.layers.2.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
248
+ "model.decoder.layers.1.experts.local_experts.3.layers.2.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
249
+ "model.decoder.layers.1.experts.local_experts.3.layers.2.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
250
+ "model.decoder.layers.1.experts.local_experts.3.layers.2.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
251
+ "model.decoder.layers.1.experts.local_experts.3.layers.2.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
252
+ "model.decoder.layers.1.experts.local_experts.3.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
253
+ "model.decoder.layers.1.experts.local_experts.3.layers.3.self_attention.linear_proj.weight": "model-00001-of-00002.safetensors",
254
+ "model.decoder.layers.1.experts.local_experts.3.layers.3.self_attention.linear_qkv.weight": "model-00001-of-00002.safetensors",
255
+ "model.decoder.layers.1.experts.local_experts.3.layers.3.pre_mlp_layernorm.weight": "model-00001-of-00002.safetensors",
256
+ "model.decoder.layers.1.experts.local_experts.3.layers.3.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
257
+ "model.decoder.layers.1.experts.local_experts.3.layers.3.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
258
+ "model.decoder.layers.1.experts.local_experts.3.final_layernorm.weight": "model-00001-of-00002.safetensors",
259
+ "model.decoder.layers.1.experts.local_experts.3.patch_embedding.linear_fc1.weight": "model-00001-of-00002.safetensors",
260
+ "model.decoder.layers.1.experts.local_experts.3.patch_embedding.linear_fc2.weight": "model-00001-of-00002.safetensors",
261
+ "model.decoder.layers.1.experts.local_experts.3.output_layer.weight": "model-00002-of-00002.safetensors",
262
+ "model.decoder.layers.1.shared_experts.layers.0.input_layernorm.weight": "model-00002-of-00002.safetensors",
263
+ "model.decoder.layers.1.shared_experts.layers.0.self_attention.linear_proj.weight": "model-00002-of-00002.safetensors",
264
+ "model.decoder.layers.1.shared_experts.layers.0.self_attention.linear_qkv.weight": "model-00002-of-00002.safetensors",
265
+ "model.decoder.layers.1.shared_experts.layers.0.pre_mlp_layernorm.weight": "model-00002-of-00002.safetensors",
266
+ "model.decoder.layers.1.shared_experts.layers.0.mlp.linear_fc1.weight": "model-00002-of-00002.safetensors",
267
+ "model.decoder.layers.1.shared_experts.layers.0.mlp.linear_fc2.weight": "model-00002-of-00002.safetensors",
268
+ "model.decoder.layers.1.shared_experts.layers.1.input_layernorm.weight": "model-00002-of-00002.safetensors",
269
+ "model.decoder.layers.1.shared_experts.layers.1.self_attention.linear_proj.weight": "model-00002-of-00002.safetensors",
270
+ "model.decoder.layers.1.shared_experts.layers.1.self_attention.linear_qkv.weight": "model-00002-of-00002.safetensors",
271
+ "model.decoder.layers.1.shared_experts.layers.1.pre_mlp_layernorm.weight": "model-00002-of-00002.safetensors",
272
+ "model.decoder.layers.1.shared_experts.layers.1.mlp.linear_fc1.weight": "model-00002-of-00002.safetensors",
273
+ "model.decoder.layers.1.shared_experts.layers.1.mlp.linear_fc2.weight": "model-00002-of-00002.safetensors",
274
+ "model.decoder.layers.1.shared_experts.layers.2.input_layernorm.weight": "model-00002-of-00002.safetensors",
275
+ "model.decoder.layers.1.shared_experts.layers.2.self_attention.linear_proj.weight": "model-00002-of-00002.safetensors",
276
+ "model.decoder.layers.1.shared_experts.layers.2.self_attention.linear_qkv.weight": "model-00002-of-00002.safetensors",
277
+ "model.decoder.layers.1.shared_experts.layers.2.pre_mlp_layernorm.weight": "model-00002-of-00002.safetensors",
278
+ "model.decoder.layers.1.shared_experts.layers.2.mlp.linear_fc1.weight": "model-00002-of-00002.safetensors",
279
+ "model.decoder.layers.1.shared_experts.layers.2.mlp.linear_fc2.weight": "model-00002-of-00002.safetensors",
280
+ "model.decoder.layers.1.shared_experts.layers.3.input_layernorm.weight": "model-00002-of-00002.safetensors",
281
+ "model.decoder.layers.1.shared_experts.layers.3.self_attention.linear_proj.weight": "model-00002-of-00002.safetensors",
282
+ "model.decoder.layers.1.shared_experts.layers.3.self_attention.linear_qkv.weight": "model-00002-of-00002.safetensors",
283
+ "model.decoder.layers.1.shared_experts.layers.3.pre_mlp_layernorm.weight": "model-00002-of-00002.safetensors",
284
+ "model.decoder.layers.1.shared_experts.layers.3.mlp.linear_fc1.weight": "model-00002-of-00002.safetensors",
285
+ "model.decoder.layers.1.shared_experts.layers.3.mlp.linear_fc2.weight": "model-00002-of-00002.safetensors",
286
+ "model.decoder.layers.1.shared_experts.final_layernorm.weight": "model-00002-of-00002.safetensors",
287
+ "model.decoder.layers.1.shared_experts.patch_embedding.linear_fc1.weight": "model-00002-of-00002.safetensors",
288
+ "model.decoder.layers.1.shared_experts.patch_embedding.linear_fc2.weight": "model-00002-of-00002.safetensors",
289
+ "model.decoder.layers.1.shared_experts.output_layer.weight": "model-00002-of-00002.safetensors",
290
+ "model.output_layer.weight": "model-00002-of-00002.safetensors"
291
+ }
292
+ }
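The index above follows the standard Hugging Face sharded-checkpoint layout: `metadata.total_size` is the checkpoint size in bytes and `weight_map` maps each parameter name to the shard file that stores it. A quick way to inspect the split (sketch, run from the repository root):

```python
# Sketch: count how many parameters live in each safetensors shard.
import json
from collections import Counter

with open("model.safetensors.index.json") as f:
    index = json.load(f)

print(index["metadata"]["total_size"])        # total size in bytes (4983109888 here)
print(Counter(index["weight_map"].values()))  # parameters per shard file
```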
modeling_patch_moe.py ADDED
@@ -0,0 +1,1326 @@
+ import torch
+ from typing import Optional
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from torch import Tensor
+ import math
+ from functools import reduce
+ from abc import ABC, abstractmethod
+ from .configuration_patch_moe import PatchMoeConfig
+ from .ts_generation_mixin import PatchMoEGenerationMixin
+ from transformers import PreTrainedModel
+
+
+ def _rotate_half(x: Tensor, rotary_interleaved: bool) -> Tensor:
+     """Change sign so the last dimension becomes [-odd, +even]
+
+     Args:
+         x (Tensor): Input tensor
+
+     Returns:
+         Tensor: Tensor rotated half
+     """
+     if not rotary_interleaved:
+         x1, x2 = torch.chunk(x, 2, dim=-1)
+         return torch.cat((-x2, x1), dim=-1)
+     else:
+         x1 = x[:, :, :, ::2]
+         x2 = x[:, :, :, 1::2]
+         x_new = torch.stack((-x2, x1), dim=-1)
+         return x_new.view(x_new.shape[0], x_new.shape[1], x_new.shape[2], -1)
+
+
+ def _apply_rotary_pos_emb_bshd(
+     t: Tensor,
+     freqs: Tensor,
+     rotary_interleaved: bool = False,
+     multi_latent_attention: bool = False,
+     mscale: float = 1.0,
+ ) -> Tensor:
+     """Apply rotary positional embedding to input tensor T.
+
+     check https://kexue.fm/archives/8265 for detailed formulas
+
+     Args:
+         t (Tensor): Input tensor T is of shape [seq_length, ... , dim]
+         freqs (Tensor): Rotary Positional embedding tensor freq is of shape [seq_length, ..., dim]
+
+     Returns:
+         Tensor: The input tensor after applying RoPE
+     """
+     freqs = freqs.to(t.device)
+     rot_dim = freqs.shape[-1]
+
+     # ideally t_pass is empty so rotary pos embedding is applied to all tensor t
+     t, t_pass = t[..., :rot_dim], t[..., rot_dim:]
+
+     if multi_latent_attention:
+         x1 = t[..., 0::2]
+         x2 = t[..., 1::2]
+         t = torch.cat((x1, x2), dim=-1)
+
+     # first part is cosine component
+     # second part is sine component, need to change signs with _rotate_half method
+     cos_ = (torch.cos(freqs) * mscale).to(t.dtype)
+     sin_ = (torch.sin(freqs) * mscale).to(t.dtype)
+
+     t = (t * cos_) + (_rotate_half(t, rotary_interleaved) * sin_)
+     return torch.cat((t, t_pass), dim=-1)
+
+
+ def topk_softmax_with_capacity(
+     logits: torch.Tensor,
+     topk: int,
+     use_pre_softmax: bool = False,
+     score_function: str = "softmax",
+     expert_bias: Optional[torch.Tensor] = None,
+ ):
+     """Apply capacity and padding to the top-k selection.
+     Args:
+         logits (torch.Tensor): Logits tensor.
+         topk (int): The number of experts to select for each token.
+         use_pre_softmax (bool): Whether to apply softmax or sigmoid before top-k selection.
+         score_function (str): The score function to use. Can be either "softmax" or "sigmoid".
+         expert_bias (torch.Tensor): The bias added to logits for expert routing.
+     Returns:
+         Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+             - routing_probs (torch.Tensor): A tensor of shape [num_tokens, num_experts] containing
+               the routing probabilities for each token to each expert.
+             - routing_map (torch.Tensor): A mask tensor of shape [num_tokens, num_experts]
+               indicating which experts were selected for each token. True values represent
+               the selected experts.
+             - tokens_per_expert (torch.Tensor): A tensor of shape [num_experts] containing
+               the number of local tokens assigned to each expert before dropping and padding.
+     """
+     assert logits.dim() == 2, f"Expected 2D logits [num_tokens, num_experts], got {logits.dim()}."
+
+     def compute_topk(
+         scores,
+         topk,
+     ):
+         return torch.topk(scores, k=topk, dim=1)
+
+     if score_function == "softmax":
+         if use_pre_softmax:
+             scores = torch.softmax(logits, dim=-1, dtype=torch.float32).type_as(logits)
+             probs, top_indices = compute_topk(
+                 scores,
+                 topk,
+             )
+         else:
+             scores, top_indices = compute_topk(
+                 logits,
+                 topk,
+             )
+             probs = torch.softmax(scores, dim=-1, dtype=torch.float32).type_as(logits)
+     elif score_function == "sigmoid":
+         scores = torch.sigmoid(logits.float()).type_as(logits)
+         if expert_bias is not None:
+             scores_for_routing = scores + expert_bias
+             _, top_indices = compute_topk(
+                 scores_for_routing,
+                 topk,
+             )
+             scores = torch.gather(scores, dim=1, index=top_indices).type_as(logits)
+         else:
+             scores, top_indices = compute_topk(
+                 scores,
+                 topk,
+             )
+         probs = scores / (scores.sum(dim=-1, keepdim=True) + 1e-20) if topk > 1 else scores
+     else:
+         raise ValueError(f"Invalid score_function: {score_function}")
+
+     # TODO Try using element-wise operations instead of scatter?
+     topk_masked_gates = torch.zeros_like(logits).scatter(1, top_indices, probs)
+     topk_map = torch.zeros_like(logits).int().scatter(1, top_indices, 1).bool()
+     # TODO: Reset topk_map to realize load-balancing?
+     tokens_per_expert = topk_map.sum(dim=0)
+
+     return topk_masked_gates, topk_map, tokens_per_expert
+
+
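As an aside, a tiny illustrative call of `topk_softmax_with_capacity` (this snippet is not part of the repository file): with `use_pre_softmax=True` and top-1 routing, every token receives exactly one non-zero gate.

```python
# Illustration only, not part of modeling_patch_moe.py.
import torch

logits = torch.randn(8, 4)  # [num_tokens, num_experts]
gates, routing_map, tokens_per_expert = topk_softmax_with_capacity(
    logits, topk=1, use_pre_softmax=True, score_function="softmax"
)
assert gates.shape == (8, 4) and routing_map.dtype == torch.bool
assert routing_map.sum(dim=1).eq(1).all()    # top-1: exactly one expert per token
assert tokens_per_expert.sum().item() == 8   # every token is assigned once
```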
143
+ class RotaryEmbedding(nn.Module):
144
+ """Rotary Embedding.
145
+
146
+ Args:
147
+ kv_channels (int): Projection weights dimension in multi-head attention. Obtained
148
+ from transformer config
149
+ rotary_interleaved (bool, optional): If True, interleaved rotary position embeddings.
150
+ Defaults to False.
151
+ rotary_base (int, optional): Base period for rotary position embeddings. Defaults to
152
+ 10000.
153
+ use_cpu_initialization (bool, optional): If False, initialize the inv_freq directly
154
+ on the GPU. Defaults to False
155
+ """
156
+
157
+ def __init__(
158
+ self,
159
+ kv_channels: int,
160
+ rotary_interleaved: bool = False,
161
+ rotary_base: int = 10000,
162
+ use_cpu_initialization: bool = False,
163
+ ) -> None:
164
+ super().__init__()
165
+
166
+ dim = kv_channels
167
+ self.rotary_interleaved = rotary_interleaved
168
+ device = "cpu" if use_cpu_initialization else torch.cuda.current_device()
169
+ self.inv_freq = 1.0 / (
170
+ rotary_base ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim)
171
+ )
172
+
173
+ def get_freqs_non_repeated(self, max_seq_len: int, offset: int = 0) -> Tensor:
174
+ """Generates matrix of frequencies based on positions in the sequence,
175
+ used to create positional encodings"""
176
+ seq = (
177
+ torch.arange(max_seq_len, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
178
+ + offset
179
+ )
180
+ freqs = torch.outer(seq, self.inv_freq) # [seq len, dim]
181
+ return freqs
182
+
183
+ def forward(
184
+ self, max_seq_len: int, offset: int = 0, packed_seq: bool = False, device=None
185
+ ) -> Tensor:
186
+ """Forward pass of RoPE embedding.
187
+
188
+ Args:
189
+ max_seq_len (int): Maximum size of sequence
190
+ offset (int, optional): RoPE offset. Defaults to 0.
191
+ packed_seq (bool, optional): Whether to use packed sequence. Defaults to False.
192
+
193
+ Returns:
194
+ Tensor: Embeddings after applying RoPE.
195
+ """
196
+ if device is None:
197
+ device = self.inv_freq.device
198
+ if self.inv_freq.device.type == "cpu":
199
+ # move `inv_freq` to GPU once at the first micro-batch forward pass
200
+ self.inv_freq = self.inv_freq.to(device=device)
201
+
202
+ freqs = self.get_freqs_non_repeated(max_seq_len, offset).to(device)
203
+ # first part even vector components, second part odd vector components,
204
+ # 2 * dim in dimension size
205
+ if not self.rotary_interleaved:
206
+ emb = torch.cat((freqs, freqs), dim=-1)
207
+ else:
208
+ emb = torch.stack((freqs.view(-1, 1), freqs.view(-1, 1)), dim=-1).view(
209
+ freqs.shape[0], -1
210
+ )
211
+ # emb [seq_length, .., dim]
212
+ emb = emb[:, None, None, :]
213
+ return emb.to(device)
214
+
215
+ def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs):
216
+ state_dict.pop(f"{prefix}inv_freq", None)
217
+ return super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)
218
+
219
+ def get_rotary_seq_len(
220
+ self,
221
+ transformer_input: Tensor,
222
+ ) -> float:
223
+ """Function to get the rotary sequence length.
224
+ Args:
225
+ transformer_input (Tensor): Input tensor to the transformer
226
+ Returns:
227
+ float: The rotary sequence length
228
+ """
229
+ rotary_seq_len = transformer_input.size(0)
230
+ return rotary_seq_len
231
+
232
+
233
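As a sanity check on the rotary embedding shape, a minimal sketch (the constructor values mirror config.json but are otherwise arbitrary; `use_cpu_initialization=True` avoids needing a GPU):

    import torch

    rope = RotaryEmbedding(kv_channels=64, rotary_base=1000000, use_cpu_initialization=True)
    emb = rope(max_seq_len=30)   # e.g. 2880 // 96 = 30 patches
    print(emb.shape)             # torch.Size([30, 1, 1, 64]) -> [seq_len, 1, 1, kv_channels]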
+ class IdentityOp(nn.Module):
234
+ def forward(self, x):
235
+ return x
236
+
237
+
238
+ class IdentityFuncOp(nn.Module):
239
+ def forward(self, x):
240
+ return x
241
+
242
+
243
+ class RMSNorm(nn.Module):
244
+ def __init__(self, hidden_size, eps=1e-5):
245
+ super().__init__()
246
+ self.weight = nn.Parameter(torch.ones(hidden_size))
247
+ self.variance_epsilon = eps
248
+
249
+ def forward(self, hidden_states):
250
+ """
251
+ hidden_states [bs, patch_num, d_model]
252
+ """
253
+ input_dtype = hidden_states.dtype
254
+ hidden_states = hidden_states.to(torch.float32)
255
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
256
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
257
+ return self.weight * hidden_states.to(input_dtype)
258
+
259
+
260
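The normalization above is standard RMSNorm; a minimal numerical check against the formula x / sqrt(mean(x^2) + eps), assuming the class is imported from this module and its weight is still at its initial value of ones:

    import torch

    x = torch.randn(2, 4, 8)                       # [bs, patch_num, d_model]
    norm = RMSNorm(hidden_size=8)
    reference = x / torch.sqrt(x.pow(2).mean(-1, keepdim=True) + 1e-5)
    assert torch.allclose(norm(x), reference, atol=1e-5)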
+ class TEDotProductAttention(nn.Module):
261
+ """Implement the scaled dot product attention with softmax.
262
+ Arguments
263
+ ---------
264
+ softmax_scale: The temperature to use for the softmax attention.
265
+ (default: 1/sqrt(d_keys) where d_keys is computed at
266
+ runtime)
267
+ attention_dropout: The dropout rate to apply to the attention
268
+ (default: 0.0)
269
+ """
270
+
271
+ def __init__(self, causal=False, softmax_scale=None, attention_dropout=0.0):
272
+ super().__init__()
273
+ self.causal = causal
274
+ self.softmax_scale = softmax_scale
275
+ self.drop = nn.Dropout(attention_dropout)
276
+
277
+ def forward(
278
+ self,
279
+ q,
280
+ k,
281
+ v,
282
+ attention_mask,
283
+ causal=None,
284
+ ):
285
+ """Implements the multihead softmax attention.
286
+ Arguments
287
+ ---------
288
+ q, k, v: Query, key, and value tensors of shape (S, B, H, D); they are transposed to batch-first internally.
289
+ attention_mask: mask applied to the attention scores; positions where the mask is 0 are masked out.
290
+ causal: if passed, will override self.causal
291
+ (B: batch size, S: sequence length, H: number of heads, D: head dimension)
292
+ """
293
+ causal = self.causal if causal is None else causal
294
+
295
+ q = q.transpose(0, 1).contiguous()
296
+ k = k.transpose(0, 1).contiguous()
297
+ v = v.transpose(0, 1).contiguous()
298
+
299
+ batch_size, seq_len = q.shape[0], q.shape[1]
300
+ softmax_scale = self.softmax_scale or 1.0 / math.sqrt(q.shape[-1])
301
+ # scores
302
+ scores = torch.einsum("bthd,bshd->bhts", q, k * softmax_scale)
303
+ scores = scores.masked_fill(attention_mask == 0, float("-1e9"))
304
+ # Softmax
305
+ attention = torch.softmax(scores, dim=-1, dtype=v.dtype)
306
+ # Dropout
307
+ attention_drop = self.drop(attention)
308
+ output = torch.einsum("bhts,bshd->bthd", attention_drop, v)
309
+ output = output.reshape(batch_size, seq_len, -1).transpose(0, 1).contiguous()
310
+ return output
311
+
312
+
313
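A shape-only walk-through of the attention core above, as a standalone sketch with small made-up sizes (the layout at the boundary is (S, B, H, D), batch-first inside):

    import math
    import torch

    S, B, H, D = 5, 2, 4, 8
    q, k, v = (torch.randn(S, B, H, D) for _ in range(3))
    mask = torch.ones(B, 1, S, S)                                      # 1 = attend, 0 = masked out

    qb, kb, vb = (t.transpose(0, 1) for t in (q, k, v))                # -> (B, S, H, D)
    scores = torch.einsum("bthd,bshd->bhts", qb, kb / math.sqrt(D))    # [B, H, S, S]
    scores = scores.masked_fill(mask == 0, -1e9)
    attn = torch.softmax(scores, dim=-1)
    out = torch.einsum("bhts,bshd->bthd", attn, vb)                    # [B, S, H, D]
    out = out.reshape(B, S, H * D).transpose(0, 1)                     # back to (S, B, H*D)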
+ class SelfAttention(nn.Module):
314
+ def __init__(
315
+ self,
316
+ config,
317
+ ):
318
+ super().__init__()
319
+ self.config = config
320
+ q_layernorm = config.q_layernorm
321
+ k_layernorm = config.k_layernorm
322
+ self.hidden_size = config.hidden_size
323
+ self.core_attention = TEDotProductAttention()
324
+ self.linear_proj = nn.Linear(
325
+ self.hidden_size,
326
+ self.hidden_size,
327
+ bias=config.add_bias_linear,
328
+ )
329
+ self.linear_qkv = nn.Linear(
330
+ self.hidden_size,
331
+ 3 * self.hidden_size,
332
+ bias=config.add_bias_linear,
333
+ )
334
+ if q_layernorm:
335
+ self.q_layernorm = RMSNorm(self.hidden_size)
336
+ else:
337
+ self.q_layernorm = IdentityOp()
338
+ if k_layernorm:
339
+ self.k_layernorm = RMSNorm(self.hidden_size)
340
+ else:
341
+ self.k_layernorm = IdentityOp()
342
+
343
+ def forward(self, x, attention_mask, rotary_pos_emb):
344
+ qkv = self.linear_qkv(x)
345
+ qkv = qkv.view(qkv.size(0), qkv.size(1), self.config.num_attention_heads, -1)
346
+ q, k, v = qkv.chunk(3, dim=-1)
347
+
348
+ # q/k norm
349
+ q = self.q_layernorm(q)
350
+ k = self.k_layernorm(k)
351
+
352
+ # Apply rotary encoding to q and k
353
+ rotary_pos_emb = (rotary_pos_emb,) * 2
354
+ q_pos_emb, k_pos_emb = rotary_pos_emb
355
+ q = _apply_rotary_pos_emb_bshd(q, q_pos_emb)
356
+ k = _apply_rotary_pos_emb_bshd(k, k_pos_emb)
357
+
358
+ # attention
359
+ attn_output = self.core_attention(q, k, v, attention_mask)
360
+ output = self.linear_proj(attn_output)
361
+ return output
362
+
363
+
364
+ class MLP(nn.Module):
365
+ def __init__(self, config, in_features):
366
+ super().__init__()
367
+ self.config = config
368
+ self.linear_fc1 = nn.Linear(
369
+ in_features,
370
+ self.config.moe_ffn_hidden_size * 2,
371
+ bias=self.config.add_bias_linear,
372
+ )
373
+ self.linear_fc2 = nn.Linear(
374
+ self.config.moe_ffn_hidden_size,
375
+ self.config.hidden_size,
376
+ bias=self.config.add_bias_linear,
377
+ )
378
+
379
+ def forward(self, x):
380
+ x = self.swiglu(self.linear_fc1(x))
381
+ x = self.linear_fc2(x)
382
+ return x
383
+
384
+ def swiglu(self, y):
385
+ """Performs SwiGLU (Swish-Gated Linear Unit) activation function.
386
+
387
+ Args:
388
+ y (torch.Tensor): Input tensor to be split into two halves along the last dimension.
389
+
390
+ Returns:
391
+ torch.Tensor: Result of SwiGLU activation: SiLU(y1) * y2, where y1, y2 are the split halves.
392
+ """
393
+ y_1, y_2 = torch.chunk(y, 2, -1)
394
+ return F.silu(y_1) * y_2
395
+
396
+
397
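The SwiGLU split above is why linear_fc1 projects to 2 * moe_ffn_hidden_size: one half gates the other. The same activation in isolation, with arbitrary small sizes:

    import torch
    import torch.nn.functional as F

    y = torch.randn(3, 8)                # stand-in for linear_fc1's output (2x the FFN width)
    y1, y2 = torch.chunk(y, 2, dim=-1)   # gate half and value half, 4 features each
    out = F.silu(y1) * y2                # SwiGLU: SiLU(y1) * y2 -> shape [3, 4]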
+ class TransformerLayer(nn.Module):
398
+ def __init__(self, config, input_layernorm):
399
+ super().__init__()
400
+ self.config = config
401
+ if input_layernorm:
402
+ self.input_layernorm = RMSNorm(self.config.hidden_size)
403
+ else:
404
+ self.input_layernorm = IdentityOp()
405
+ self.self_attention = SelfAttention(config)
406
+ self.pre_mlp_layernorm = RMSNorm(self.config.hidden_size)
407
+ self.mlp = MLP(config, self.config.hidden_size)
408
+
409
+ def forward(self, x, attention_mask, rotary_pos_emb):
410
+ residual = x
411
+ x = self.input_layernorm(x)
412
+ x = self.self_attention(x, attention_mask, rotary_pos_emb)
413
+ x = x + residual
414
+ residual = x
415
+ x = self.pre_mlp_layernorm(x)
416
+ x = self.mlp(x)
417
+ x = x + residual
418
+ return x
419
+
420
+
421
+ class PatchMoEExpert_v2(nn.Module):
422
+ def __init__(self, config, patch_input_size=32, expert_output_size=336, final_layernorm=True):
423
+ super().__init__()
424
+ self.config = config
425
+ self.patch_size = patch_input_size
426
+ self.seq_length = config.seq_length
427
+ assert (
428
+ self.seq_length % self.patch_size == 0
429
+ ), f"invalid patch_size: {self.patch_size} when seq_length={self.seq_length}"
430
+ self.patch_num = self.seq_length // self.patch_size
431
+ self.flatten_size = self.patch_num * self.config.hidden_size
432
+
433
+ self.layers = nn.ModuleList(
434
+ [
435
+ TransformerLayer(config, input_layernorm=config.transformer_input_layernorm)
436
+ for _ in range(self.config.expert_num_layers)
437
+ ]
438
+ )
439
+ if final_layernorm:
440
+ self.final_layernorm = RMSNorm(self.config.hidden_size)
441
+ else:
442
+ self.final_layernorm = IdentityOp()
443
+ self.patch_embedding = MLP(config, in_features=patch_input_size)
444
+ self.output_layer = nn.Linear(
445
+ in_features=self.flatten_size,
446
+ out_features=expert_output_size,
447
+ bias=False,
448
+ )
449
+
450
+ def _forward_patch_embedding(
451
+ self,
452
+ input: Tensor, # [batch_size, seq_len]
453
+ ):
454
+ """
455
+ Perform patch embedding on the input time series.
456
+
457
+ This method applies a linear transformation to the input tensor to
458
+ convert it into patches and then embeds these patches using a linear layer.
459
+ """
460
+ batch_size, seq_len = input.shape
461
+ assert (
462
+ seq_len == self.seq_length
463
+ ), f"Expected sequence length {self.seq_length}, but got {seq_len}"
464
+
465
+ # Create input_mask based on pad_length
466
+ # When a time point is masked, its value is mask_pad_value(default:255.)
467
+ input_mask = (
468
+ input != self.config.mask_pad_value
469
+ ) # 0: mask, 1: unmask [batch_size, seq_len]
470
+
471
+ # Zero out masked time points so that, together with the attention mask built below, they do not influence the result
472
+ input_data = input * input_mask # [batch_size, seq_len]
473
+
474
+ # Patchify the input
475
+ input_data = input_data.unfold(
476
+ dimension=-1, size=self.patch_size, step=self.patch_size
477
+ ).contiguous() # input [batch_size, patch_num, patch_size]
478
+ hidden_states = self.patch_embedding(
479
+ input_data
480
+ ) # hidden_states [batch_size, patch_num, hidden_size]
481
+ hidden_states = hidden_states.transpose(
482
+ 0, 1
483
+ ).contiguous() # hidden_states [patch_num, batch_size, hidden_size], To adapt to the Megatron
484
+
485
+ # Patchify the mask: a patch is treated as unmasked only if every time point inside it is unmasked
486
+ attention_mask = input_mask.unfold(
487
+ dimension=-1, size=self.patch_size, step=self.patch_size
488
+ ).contiguous() # [batch_size, patch_num, patch_size]
489
+ attention_mask = (
490
+ attention_mask.sum(-1) == self.patch_size
491
+ ) # [batch_size, patch_num] # 0: mask, 1: unmask
492
+ attention_mask[:, -1] = True # The last patch is not masked
493
+ _, patch_num = attention_mask.shape
494
+ attention_mask = attention_mask.unsqueeze(2).repeat(
495
+ 1, 1, patch_num
496
+ ) * attention_mask.unsqueeze(1).repeat(
497
+ 1, patch_num, 1
498
+ ) # [batch_size, patch_num, patch_num]
499
+ attention_mask = attention_mask.unsqueeze(
500
+ 1
501
+ ).contiguous() # [batch_size, 1, patch_num, patch_num]
502
+
503
+ return hidden_states, attention_mask, input_mask
504
+
505
+ def _forward_output(
506
+ self, hidden_states, output_scale=None, input_mask=None, inference_context=None
507
+ ):
508
+ """
509
+ Perform a forward pass through the output layer.
510
+
511
+ Args:
512
+ hidden_states (Tensor): Transformed hidden states of shape [patch_num, batch_size, hidden_size]
513
+ output_scale (Tensor, optional): Expert probabilities used to scale the output [batch_size]
514
+ input_mask (Tensor, optional): Expert input mask of shape [batch_size, seq_len], 0: mask, 1: unmask
515
+ inference_context (optional): Unused; kept for interface compatibility
516
+
517
+ Returns:
518
+ expert_output (Tensor): Expert output of shape [batch_size, expert_output_size]
519
+ """
520
+
521
+ # [patch_num, batch_size, hidden_size] -> [batch_size, flatten_size (patch_num * hidden_size)]
522
+ patch_num, batch_size, hidden_size = hidden_states.shape
523
+ assert (
524
+ patch_num * hidden_size
525
+ ) == self.flatten_size, f"patch_num ({patch_num}) * hidden_size ({hidden_size}) != flatten_size ({self.flatten_size})"
526
+ hidden_states = hidden_states.transpose(0, 1).reshape(-1, self.flatten_size).contiguous()
527
+ expert_output = self.output_layer(hidden_states) # [batch_size, expert_output_size]
528
+ if output_scale is not None:
529
+ original_dtype = expert_output.dtype
530
+ expert_output = expert_output * output_scale.unsqueeze(-1)
531
+ expert_output = expert_output.to(original_dtype)
532
+
533
+ return expert_output
534
+
535
+ def forward(self, expert_input, rotary_pos_emb, expert_probs=None):
536
+ hidden_states, attention_mask, input_mask = self._forward_patch_embedding(expert_input)
537
+ for layer in self.layers:
538
+ hidden_states = layer(
539
+ hidden_states, attention_mask, rotary_pos_emb[: hidden_states.shape[0]]
540
+ )
541
+ hidden_states = self.final_layernorm(hidden_states)
542
+ expert_output = self._forward_output(hidden_states, expert_probs, input_mask)
543
+ return expert_output
544
+
545
+
546
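The patchify step in _forward_patch_embedding is a non-overlapping unfold; a small illustration with the shipped config sizes (seq_length 2880, one expert's patch_size 96, mask_pad_value 255.0):

    import torch

    x = torch.randn(2, 2880)                                    # [batch_size, seq_len]
    patches = x.unfold(dimension=-1, size=96, step=96)          # [2, 30, 96]: 2880 / 96 = 30 patches
    mask = (x != 255.0).unfold(dimension=-1, size=96, step=96)
    patch_keep = mask.sum(-1) == 96                             # a patch is kept only if fully unmasked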
+ class SequentialPatchMoE(nn.Module):
547
+ def __init__(self, config, expert_output_size=336):
548
+ super().__init__()
549
+ self.config = config
550
+ self.expert_output_size = expert_output_size
551
+ self.local_experts = nn.ModuleList(
552
+ [
553
+ PatchMoEExpert_v2(
554
+ config,
555
+ expert_output_size=expert_output_size,
556
+ patch_input_size=config.patch_size_list[expert_id],
557
+ final_layernorm=config.moe_expert_final_layernorm,
558
+ )
559
+ for expert_id in range(config.num_moe_experts)
560
+ ]
561
+ )
562
+
563
+ def forward(self, input, routing_map, rotary_pos_emb, expert_probs):
564
+ expert_output_list = []
565
+ batch_size, seq_len = input.size()
566
+
567
+ for i, expert in enumerate(self.local_experts):
568
+ token_mask = routing_map[:, i].bool() # shape (batch,)
569
+ current_inputs = input[token_mask] # (num_tokens_for_expert, seq_len)
570
+ current_probs = expert_probs[token_mask, i]
571
+
572
+ if current_inputs.numel() == 0:
573
+ expert_output = torch.zeros(
574
+ 0, self.expert_output_size, device=input.device, dtype=input.dtype
575
+ )
576
+ else:
577
+ expert_output = expert(current_inputs, rotary_pos_emb, current_probs)
578
+
579
+ full_output = torch.zeros(
580
+ batch_size, self.expert_output_size, device=input.device, dtype=input.dtype
581
+ )
582
+ full_output[token_mask] = expert_output
583
+ expert_output_list.append(full_output)
584
+
585
+ expert_output = reduce(torch.add, expert_output_list)
586
+ return expert_output
587
+
588
+
589
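The per-expert dispatch above relies on boolean indexing over the routing map; the pattern reduced to its essentials, with the experts replaced by a no-op purely for illustration:

    import torch
    from functools import reduce

    batch, seq_len, num_experts = 4, 8, 2
    x = torch.randn(batch, seq_len)
    routing_map = torch.tensor([[1, 0], [0, 1], [1, 0], [0, 1]], dtype=torch.bool)

    outputs = []
    for i in range(num_experts):
        token_mask = routing_map[:, i]
        full = torch.zeros_like(x)
        full[token_mask] = x[token_mask]      # a real expert would transform x[token_mask] here
        outputs.append(full)
    combined = reduce(torch.add, outputs)     # per-expert results scattered back and summed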
+ class RouterGatingLinearFunction(torch.autograd.Function):
590
+ """
591
+ Autograd function for router gating linear.
592
+ """
593
+
594
+ @staticmethod
595
+ def forward(ctx, inp: torch.Tensor, weight: torch.Tensor, router_dtype: torch.dtype):
596
+ """
597
+ Forward pass of the RouterGatingLinearFunction function.
598
+ """
599
+ ctx.router_dtype = router_dtype
600
+ ctx.input_dtype = inp.dtype
601
+ ctx.weight_dtype = weight.dtype
602
+ inp_shape = inp.shape
603
+ inp = inp.view(-1, inp_shape[-1])
604
+
605
+ output = torch.mm(inp.to(router_dtype), weight.to(router_dtype).t())
606
+
607
+ output = output.view(*inp_shape[:-1], -1)
608
+ return output
609
+
610
+
611
+ def router_gating_linear(inp: torch.Tensor, weight: torch.Tensor, router_dtype: torch.dtype):
612
+ """
613
+ Customized linear layer for router gating.
614
+ This linear layer accepts bfloat16 input and weight, and can return output with router_dtype.
615
+ It can reduce the memory usage by avoiding saving the intermediate high precision tensors.
616
+ """
617
+ return RouterGatingLinearFunction.apply(inp, weight, router_dtype)
618
+
619
+
620
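Functionally, the router gate above is a plain matrix multiply in the router dtype (only a forward pass is defined on the custom Function, so this sketch assumes inference). With made-up float32 tensors sized like the shipped config:

    import torch

    inp = torch.randn(4, 2880)              # [batch, moe_router_input_size]
    weight = torch.randn(4, 2880)           # [num_experts, moe_router_input_size]
    logits = torch.mm(inp, weight.t())      # [batch, num_experts] router logits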
+ class Router(ABC, nn.Module):
621
+ """Base Router class"""
622
+
623
+ def __init__(
624
+ self,
625
+ config: PatchMoeConfig,
626
+ ) -> None:
627
+ """
628
+ Initialize the Router module.
629
+
630
+ Args:
631
+ config (PatchMoeConfig): Configuration object for the PatchMoE model.
633
+ """
634
+ super().__init__()
635
+ self.config = config
636
+
637
+ # Initialize the gate weights.
638
+
639
+ if self.config.patch_size_list is not None:
640
+ assert self.config.moe_router_input_size is not None
641
+ self.weight = torch.nn.Parameter(
642
+ torch.empty(
643
+ (self.config.num_moe_experts, self.config.moe_router_input_size),
644
+ dtype=torch.float32,
645
+ )
646
+ )
647
+ else:
648
+ self.weight = torch.nn.Parameter(
649
+ torch.empty(
650
+ (self.config.num_moe_experts, self.config.hidden_size), dtype=torch.float32
651
+ )
652
+ )
653
+ self.reset_parameters()
654
+
655
+ def reset_parameters(self):
656
+ """Reset the router parameters."""
657
+ torch.nn.init.normal_(self.weight, mean=0, std=self.config.init_method_std)
658
+ self.weight.data = self.weight.data.to(dtype=self.config.torch_dtype)
659
+
660
+ def gating(self, input: torch.Tensor):
661
+ """Forward pass of the router gate.
662
+
663
+ Args:
664
+ input (torch.Tensor): Input tensor.
665
+
666
+ Returns:
667
+ torch.Tensor: Logits tensor.
668
+ """
669
+ if self.weight.device != input.device:
670
+ self.weight.data = self.weight.data.to(input.device)  # move in place; reassigning a registered Parameter with a plain Tensor would raise
671
+ router_dtype = input.dtype
672
+ logits = router_gating_linear(input, self.weight, router_dtype)
673
+ return logits
674
+
675
+ @abstractmethod
676
+ def routing(self, logits: torch.Tensor):
677
+ """Routing function.
678
+
679
+ Args:
680
+ logits (torch.Tensor): Logits tensor.
681
+
682
+ Returns:
683
+ Tuple[torch.Tensor, torch.Tensor]: A tuple containing token assignment
684
+ probabilities and mapping.
685
+ """
686
+ raise NotImplementedError("Routing function not implemented.")
687
+
688
+ @abstractmethod
689
+ def forward(self, input: torch.Tensor):
690
+ """
691
+ Forward pass of the router.
692
+
693
+ Args:
694
+ input (torch.Tensor): Input tensor.
695
+ """
696
+ raise NotImplementedError("Forward function not implemented.")
697
+
698
+
699
+ class TopKRouter(Router):
700
+ """Route each token to the top-k experts."""
701
+
702
+ def __init__(
703
+ self,
704
+ config: PatchMoeConfig,
705
+ ) -> None:
706
+ """Initialize the zero token dropping router.
707
+
708
+ Args:
709
+ config (TransformerConfig): The configuration for the transformer model.
710
+ model_comm_pgs (ModelCommProcessGroups, optional): Process groups for MoE operations.
711
+ """
712
+ super().__init__(config=config)
713
+ self.topk = self.config.moe_router_topk
714
+ self.score_function = self.config.moe_router_score_function
715
+
716
+ self.enable_expert_bias = self.config.moe_router_enable_expert_bias
717
+ if self.enable_expert_bias:
718
+ self.register_buffer(
719
+ "local_tokens_per_expert",
720
+ torch.zeros(self.config.num_moe_experts, dtype=torch.float32),
721
+ persistent=False,
722
+ )
723
+ self.register_buffer(
724
+ "expert_bias", torch.zeros(self.config.num_moe_experts, dtype=torch.float32)
725
+ )
726
+ else:
727
+ self.local_tokens_per_expert = None
728
+ self.expert_bias = None
729
+
730
+ def routing(self, logits: torch.Tensor):
731
+ """Top-k routing function
732
+
733
+ Args:
734
+ logits (torch.Tensor): Logits tensor after gating.
735
+
736
+ Returns:
737
+ probs (torch.Tensor): The probabilities of token to experts assignment.
738
+ routing_map (torch.Tensor): The mapping of token to experts assignment,
739
+ with shape [num_tokens, num_experts].
740
+ """
741
+ logits = logits.view(-1, self.config.num_moe_experts)
742
+
743
+ scores, routing_map, tokens_per_expert = topk_softmax_with_capacity(
744
+ logits,
745
+ self.topk,
746
+ use_pre_softmax=self.config.moe_router_pre_softmax,
747
+ score_function=self.score_function,
748
+ expert_bias=self.expert_bias,
749
+ )
750
+ return scores, routing_map
751
+
752
+ def forward(self, input: torch.Tensor):
753
+ """
754
+ Forward pass of the router.
755
+
756
+ Args:
757
+ input (torch.Tensor): Input tensor.
758
+ """
759
+ logits = self.gating(input)
760
+
761
+ scores, routing_map = self.routing(logits)
762
+
763
+ return scores, routing_map
764
+
765
+
766
+ class PatchMoEMoELayer(nn.Module):
767
+ def __init__(self, config, layer_number):
768
+ super().__init__()
769
+ self.config = config
770
+ self.seq_length = config.seq_length
771
+ self.router = TopKRouter(config)
772
+ self.layer_number = layer_number
773
+ self.pred_length = config.pred_length
774
+ self.is_last_layer = self.layer_number == config.num_hidden_layers
775
+ if self.is_last_layer and self.config.heterogeneous_moe_layer:
776
+ self.expert_output_size = config.pred_length
777
+ else:
778
+ if self.config.do_expert_forecast:
779
+ self.expert_output_size = config.seq_length + config.pred_length
780
+ else:
781
+ self.expert_output_size = config.seq_length
782
+
783
+ if self.is_last_layer and self.config.heterogeneous_moe_layer:
784
+ # If heterogeneous_moe_layer is True, the backcast will be None
785
+ self.backcast_layernorm = None
786
+ else:
787
+ self.backcast_layernorm = RMSNorm(self.seq_length)
788
+
789
+ self.experts = SequentialPatchMoE(
790
+ config,
791
+ expert_output_size=self.expert_output_size,
792
+ )
793
+ self.shared_experts = PatchMoEExpert_v2(
794
+ config,
795
+ expert_output_size=self.expert_output_size,
796
+ patch_input_size=config.shared_patch_size,
797
+ final_layernorm=config.moe_expert_final_layernorm,
798
+ )
799
+
800
+ def time_series_preprocess(self, input: torch.Tensor):
801
+ """
802
+ Preprocess time series(sample) for dispatch.
803
+
804
+ Builds the input mask (0: mask, 1: unmask) from mask_pad_value and stores it on the layer; RevIN itself is applied earlier in PatchMoEModel.
805
+
806
+ Args:
807
+ input (torch.Tensor): The input time series (samples) to the MoE layer. [batch_size, seq_len]
808
+
809
+ Returns:
810
+ input (torch.Tensor): The input time series (samples), passed through unchanged. [batch_size, seq_len]
813
+ """
814
+
815
+ batch_size, seq_len = input.shape
816
+ assert seq_len == self.seq_length, f"seq_len {seq_len} != self.seq_length {self.seq_length}"
817
+
818
+ # Create input_mask based on pad_length
819
+ # When a time point is masked, its value is mask_pad_value(default:255.)
820
+ input_mask = (
821
+ input != self.config.mask_pad_value
822
+ ) # 0: mask, 1: unmask [batch_size, seq_len]
823
+
824
+ self.input_mask = input_mask
825
+
826
+ return input
827
+
828
+ def router_and_preprocess(self, backcast: torch.Tensor):
829
+ """Compute and preprocess time series(sample) routing for dispatch.
830
+
831
+ This method uses the router to determine which experts to send each time series(sample) to,
832
+ producing routing probabilities and a mapping. It then preprocesses the
833
+ input time series (samples) and probabilities for the time series(sample) dispatcher. The original
834
+ input time series (samples) are returned as a residual connection.
835
+ """
836
+ # backcast: [batch_size, seq_len]
837
+ backcast = self.time_series_preprocess(backcast)
838
+
839
+ residual = backcast # residual: [batch_size, seq_len], the input to the shared experts
840
+
841
+ # TODO: Check the effective of the masked value to the router
842
+ probs, routing_map = self.router(
843
+ backcast * self.input_mask
844
+ ) # probs/routing_map: [batch_size, num_experts]
845
+
846
+ return backcast, probs, residual, routing_map
847
+
848
+ def experts_compute(
849
+ self,
850
+ input: torch.Tensor, # [num_permuted_samples_after_dispatch, seq_len]
851
+ probs: torch.Tensor, # [num_permuted_samples_after_dispatch]
852
+ residual: torch.Tensor, # [batch_size, seq_len]
853
+ rotary_pos_emb: torch.Tensor, # [seq_len, 1, 1, kv_channels (hidden_size // num_heads)]
854
+ routing_map: torch.Tensor, # [batch_size, num_experts]
855
+ ):
856
+ """Computes the output of the experts on the dispatched time series(sample).
857
+
858
+ This method first post-processes the dispatched input to get permuted time series(sample)
859
+ for each expert. It then passes the time series(sample) through the local experts.
860
+ If a shared expert is configured and not overlapped with communication,
861
+ it is also applied. The output from the experts is preprocessed for the
862
+ combine step.
863
+ """
864
+ # shared_expert_output: [batch_size, seq_len (+ pred_len)]
865
+ shared_experts_output = self.shared_experts(residual, rotary_pos_emb)
866
+
867
+ # input: [batch_size, seq_len], routing_map / probs: [batch_size, num_experts]
868
+ # experts_output: [batch_size, expert_output_size]
869
+ # (samples are routed inside SequentialPatchMoE; no separate token dispatcher is used here)
870
+
871
+ experts_output = self.experts(input, routing_map, rotary_pos_emb, probs)
872
+
873
+ return experts_output, shared_experts_output
874
+
875
+ def postprocess(
876
+ self,
877
+ backcast: torch.Tensor, # [batch_size, seq_len]
878
+ forecast: torch.Tensor, # [batch_size, pred_len]
879
+ output_backcast: torch.Tensor, # [batch_size, seq_len]
880
+ output_forecast: torch.Tensor, # [batch_size, pred_len]
881
+ ):
882
+ """
883
+ Args:
884
+ backcast (torch.Tensor): The previous layer's backcast time series (samples). [batch_size, seq_len]
885
+ forecast (torch.Tensor): The previous layer's forecast time series (samples). [batch_size, pred_len]
886
+ output_backcast (torch.Tensor): The current layer's output backcast time series (samples). [batch_size, seq_len]
887
+ output_forecast (torch.Tensor): The current layer's output forecast time series (samples). [batch_size, pred_len]
888
+ """
892
+ if output_backcast is not None:
893
+ output_backcast = self.backcast_layernorm(output_backcast) # LayerNorm
894
+ if self.config.residual_backcast:
895
+ output_backcast = backcast - output_backcast
896
+
897
+ output_backcast[~self.input_mask] = (
898
+ self.config.mask_pad_value
899
+ ) # Important! Recover the mask time point back to mask_pad_value(default:255.)
900
+
901
+ if (
902
+ self.config.do_expert_forecast and forecast is not None
903
+ ): # The first layer's forecast is None
904
+ output_forecast = forecast + output_forecast
905
+
906
+ return output_backcast, output_forecast
907
+
908
+ def combine(
909
+ self,
910
+ experts_output: torch.Tensor,
911
+ shared_experts_output: torch.Tensor,
912
+ ):
913
+ """Combines expert outputs via communication and adds shared expert output.
914
+
915
+ This method uses the time series(sample) dispatcher to combine the outputs from different
916
+ experts (e.g., via an All-to-All communication). It then adds the output
917
+ from the shared expert if it exists.
918
+ """
919
+ assert (
920
+ experts_output.shape == shared_experts_output.shape
921
+ ), f"experts_output shape {experts_output.shape} doesn't equal to shared_experts_output shape:{shared_experts_output.shape}"
922
+ output = experts_output + shared_experts_output
923
+
924
+ if self.is_last_layer and self.config.heterogeneous_moe_layer:
925
+ output_backcast = None
926
+ output_forecast = output
927
+ assert (
928
+ output_forecast.shape[1] == self.pred_length
929
+ ), f"heterogeneous_moe_layer=True, expected the last moe layer's output pred len: {self.pred_length}, but got {output_forecast.shape[1]}"
930
+ else:
931
+ # Note: masked time points here may no longer equal mask_pad_value (default: 255.); they are restored in postprocess
932
+ output_backcast = output[:, : self.seq_length] # [batch_size, seq_len]
933
+
934
+ if self.config.do_expert_forecast:
935
+ output_forecast = output[:, self.seq_length :] # [batch_size, pred_len]
936
+ assert (
937
+ output_forecast.shape[1] == self.pred_length
938
+ ), f"do_expert_forecast=True, expected the last moe layer's output pred len: {self.pred_length}, but got {output_forecast.shape[1]}"
939
+ else:
940
+ output_forecast = None
941
+
942
+ return output_backcast, output_forecast
943
+
944
+ def forward(self, backcast, forecast, rotary_pos_emb):
945
+ inputs, probs, residual, routing_map = self.router_and_preprocess(backcast)
946
+ experts_output, shared_experts_output = self.experts_compute(
947
+ inputs, probs, residual, rotary_pos_emb, routing_map
948
+ )
949
+ output_backcast, output_forecast = self.combine(experts_output, shared_experts_output)
950
+ output_backcast, output_forecast = self.postprocess(
951
+ backcast, forecast, output_backcast, output_forecast
952
+ )
953
+ return output_backcast, output_forecast
954
+
955
+
956
+ class PatchMoEBlock(nn.Module):
957
+ def __init__(self, config):
958
+ super().__init__()
959
+ self.config = config
960
+ self.layers = nn.ModuleList(
961
+ [
962
+ PatchMoEMoELayer(config, layer_num + 1)
963
+ for layer_num in range(self.config.num_hidden_layers)
964
+ ]
965
+ )
966
+
967
+ def forward(self, x, rotary_pos_emb):
968
+ backcast = x
969
+ forecast = None
970
+ for layer in self.layers:
971
+ backcast, forecast = layer(backcast, forecast, rotary_pos_emb)
972
+ return backcast, forecast
973
+
974
+
975
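The block above threads a (backcast, forecast) pair through the stacked MoE layers in a doubly residual fashion: with residual_backcast set, each layer subtracts what it explained from the backcast and adds its contribution to the forecast. The bare accumulation pattern, with the layers replaced by a stand-in callable:

    import torch

    def toy_layer(backcast, forecast):
        explained = 0.5 * backcast                              # stand-in for the layer's backcast output
        contribution = torch.full((backcast.shape[0], 4), 0.1)  # stand-in for the layer's forecast output
        new_backcast = backcast - explained                     # residual backcast
        new_forecast = contribution if forecast is None else forecast + contribution
        return new_backcast, new_forecast

    backcast, forecast = torch.randn(2, 16), None
    for _ in range(2):                                          # num_hidden_layers
        backcast, forecast = toy_layer(backcast, forecast)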
+ class PatchMoEPreTrainedModel(PreTrainedModel):
976
+ config_class = PatchMoeConfig
977
+ base_model_prefix = "model"
978
+ supports_gradient_checkpointing = True
979
+ _no_split_modules = ["PatchMoEMoELayer"]
980
+ _skip_keys_device_placement = "past_key_values"
981
+ _supports_flash_attn_2 = True
982
+ _supports_sdpa = False
983
+ _supports_cache_class = True
984
+
985
+ def _init_weights(self, module):
986
+ if isinstance(module, nn.Linear):
987
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
988
+ if module.bias is not None:
989
+ module.bias.data.zero_()
990
+ elif isinstance(module, nn.Embedding):
991
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
992
+ if module.padding_idx is not None:
993
+ module.weight.data[module.padding_idx].zero_()
994
+
995
+
996
+ class PatchMoEModel(PatchMoEPreTrainedModel):
997
+ def __init__(self, config: PatchMoeConfig):
998
+ super().__init__(config)
999
+ self.config = config
1000
+ self.seq_length = config.seq_length
1001
+ self.rotary_pos_emb = RotaryEmbedding(
1002
+ kv_channels=self.config.kv_channels,
1003
+ rotary_base=config.rotary_base,
1004
+ use_cpu_initialization=self.config.use_cpu_initialization,
1005
+ rotary_interleaved=self.config.rotary_interleaved,
1006
+ )
1007
+ self.decoder = PatchMoEBlock(config=config)
1008
+ if self.config.do_expert_forecast and self.config.heterogeneous_moe_layer:
1009
+ self.output_layer = IdentityOp()
1010
+ else:
1011
+ self.output_layer = nn.Linear(
1012
+ in_features=self.seq_length,
1013
+ out_features=self.config.pred_length,
1014
+ bias=self.config.add_bias_linear,
1015
+ )
1016
+
1017
+ def revin(
1018
+ self,
1019
+ input: Tensor, # [batch_size, seq_len]
1020
+ input_mask: Tensor, # [batch_size, seq_len] 0:mask, 1:unmask
1021
+ ):
1022
+ """Normalization from Non-stationary Transformer"""
1023
+
1024
+ input_data = input * input_mask
1025
+ sum_per_sample = torch.sum(
1026
+ input_data, dim=1, keepdim=True
1027
+ ).detach() # [batch_size, 1], torch.bfloat16
1028
+ count_per_sample = torch.sum(
1029
+ input_mask, dim=1, keepdim=True
1030
+ ).detach() # [batch_size, 1], torch.int64
1031
+ assert (
1032
+ not torch.any(count_per_sample == 0)
1033
+ ), f"count_per_sample contains zeros; fully masked samples: {input[torch.where(count_per_sample.squeeze(1) == 0)[0]]}"
1034
+ means = sum_per_sample / count_per_sample # [batch_size, 1]
1035
+ input_data = input_data - means
1036
+ input_data = input_data * input_mask
1037
+ var_per_sample = (
1038
+ torch.sum(input_data**2, dim=1, keepdim=True).detach() / count_per_sample
1039
+ ) # [batch_size, 1]
1040
+ stdev = torch.sqrt(var_per_sample + 1e-9)
1041
+ input_data = input_data / stdev
1042
+ input_data = input_data * input_mask
1043
+
1044
+ # recover the mask_pad_value(default:255.)
1045
+ input = input * ~(input_mask) + input_data
1046
+
1047
+ return input, means, stdev
1048
+
1049
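The masked RevIN above ignores padded positions when computing per-sample statistics; the same computation on a toy series (a sketch, not the method itself, with mask_pad_value 255.0):

    import torch

    x = torch.tensor([[255.0, 255.0, 1.0, 2.0, 3.0]])       # first two points are mask padding
    mask = (x != 255.0).float()
    count = mask.sum(dim=1, keepdim=True)                    # 3 valid points
    mean = (x * mask).sum(dim=1, keepdim=True) / count       # 2.0
    var = (((x - mean) * mask) ** 2).sum(dim=1, keepdim=True) / count
    stdev = torch.sqrt(var + 1e-9)
    normed = ((x - mean) / stdev) * mask                     # masked slots stay 0 here; the model
                                                             # then restores mask_pad_value at those slots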
+ def forward(self, input, revin):
1050
+ batch_size, input_len = input.shape
1051
+ if input_len > self.seq_length:
1052
+ input = input[:, -self.seq_length :]
1053
+ elif input_len < self.seq_length:
1054
+ pad_len = self.seq_length - input_len
1055
+ input = F.pad(
1056
+ input, pad=(pad_len, 0), mode="constant", value=self.config.mask_pad_value
1057
+ )
1058
+ input_len = self.seq_length
1059
+
1060
+ input_mask = input != self.config.mask_pad_value
1061
+
1062
+ # Step1. RevIN
1063
+ if revin:
1064
+ input, means, stdev = self.revin(input, input_mask)
1065
+
1066
+ # Step2. Get rotary_pos_emb
1067
+ # rotary_pos_emb [input_len, 1, 1, kv_channels(hidden_size // num_heads)]
1068
+ rotary_pos_emb = self.rotary_pos_emb(input_len, device=input.device)
1069
+
1070
+ # Step3. Do one-step inference to get mixed forecasts from multiple forecast heads
1071
+ # mixed_pred: [batch_size, sum(multi_forecast_head)]
1072
+ mixed_pred = self._inference_step(
1073
+ input=input, input_mask=input_mask, rotary_pos_emb=rotary_pos_emb
1074
+ )
1075
+
1076
+ # Step4. Based on the mixed forecasts, do auto-regressive inference according to
1077
+ # the step list of each forecast head
1078
+ if self.config.multi_forecast_head_type == "single":
1079
+ final_output = self._auto_regressive_single_head(
1080
+ input=input,
1081
+ input_mask=input_mask,
1082
+ patchmoe_forecast=mixed_pred,
1083
+ rotary_pos_emb=rotary_pos_emb,
1084
+ )
1085
+ else:
1086
+ raise NotImplementedError
1087
+
1088
+ # Step5. RevIN
1089
+ if revin:
1090
+ final_output = final_output * (stdev.repeat(1, self.config.inference_length))
1091
+ final_output = final_output + (means.repeat(1, self.config.inference_length))
1092
+
1093
+ return final_output.detach().float()
1094
+
1095
+ def _inference_step(
1096
+ self,
1097
+ input,
1098
+ input_mask,
1099
+ rotary_pos_emb,
1100
+ ):
1101
+ if self.config.do_base_forecast:
1102
+ base_forecast, _ = self.base_output_layer(input)
1103
+ else:
1104
+ base_forecast = None
1105
+
1106
+ decoder_backcast, decoder_forecast = self.decoder(
1107
+ input, # [batch_size, seq_len]
1108
+ rotary_pos_emb, # [input_len, 1, 1, kv_channels(hidden_size // num_heads)]
1109
+ )
1110
+
1111
+ if self.config.do_expert_forecast:
1112
+ assert decoder_forecast is not None, f"decoder_forecast is None"
1113
+ if self.config.heterogeneous_moe_layer:
1114
+ decoder_forecast = self.output_layer(decoder_forecast) # IdentityOp
1115
+ else:
1116
+ final_forecast = self.output_layer(decoder_backcast * input_mask)
1117
+ decoder_forecast = decoder_forecast + final_forecast
1118
+ else:
1119
+ # The decoder_backcast contains the mask_pad_val(default:255.)
1120
+ decoder_forecast = self.output_layer(decoder_backcast * input_mask)
1121
+
1122
+ if self.config.do_base_forecast:
1123
+ assert base_forecast is not None, f"base_forecast is None"
1124
+ patchmoe_forecast = base_forecast + decoder_forecast
1125
+ else:
1126
+ patchmoe_forecast = decoder_forecast
1127
+
1128
+ return patchmoe_forecast
1129
+
1130
+ def _auto_regressive_single_head(
1131
+ self,
1132
+ input, # [batch_size, seq_len]
1133
+ input_mask, # [batch_size, seq_len]
1134
+ patchmoe_forecast, # [batch_size, max(multi_forecast_head)]
1135
+ rotary_pos_emb, # [seq_len, 1, 1, kv_channels(hidden_size // num_heads)]
1136
+ auto_regressive_strategy="from_long_to_short",
1137
+ ):
1138
+ """auto regressive prediction with [single] head"""
1139
+ assert (
1140
+ self.config.multi_forecast_head_type == "single"
1141
+ ), f"_auto_regressive_single_head only support multi_forecast_head_type==single "
1142
+
1143
+ if auto_regressive_strategy == "from_long_to_short":
1144
+ # From long to short
1145
+ multi_forecast_head_list = sorted(self.config.multi_forecast_head_list, reverse=True)
1146
+
1147
+ final_output = patchmoe_forecast
1148
+ while final_output.shape[1] < self.config.inference_length:
1149
+ # adaptive choose the forecast head
1150
+ remain_pred_len = self.config.inference_length - final_output.shape[1]
1151
+ for idx, head_pred_len in enumerate(multi_forecast_head_list):
1152
+ if head_pred_len <= remain_pred_len:
1153
+ break
1154
+ if idx == len(multi_forecast_head_list):
1155
+ idx = len(multi_forecast_head_list) - 1
1156
+ head_pred_len = multi_forecast_head_list[idx]
1157
+
1158
+ # one-step model prediction
1159
+ input = torch.cat([input, patchmoe_forecast], dim=1)[
1160
+ :, -self.seq_length :
1161
+ ].contiguous()
1162
+ input_mask = torch.cat(
1163
+ [
1164
+ input_mask,
1165
+ torch.ones(
1166
+ patchmoe_forecast.shape,
1167
+ dtype=input_mask.dtype,
1168
+ device=input_mask.device,
1169
+ ),
1170
+ ],
1171
+ dim=1,
1172
+ )[
1173
+ :, -self.seq_length :
1174
+ ].contiguous() # 0:mask, 1:unmask
1175
+
1176
+ patchmoe_forecast = self._inference_step(
1177
+ input=input,
1178
+ input_mask=input_mask,
1179
+ rotary_pos_emb=rotary_pos_emb,
1180
+ )
1181
+
1182
+ # the core idea of multi forecast head type of [single]
1183
+ patchmoe_forecast = patchmoe_forecast[:, :head_pred_len]
1184
+
1185
+ final_output = torch.cat([final_output, patchmoe_forecast], dim=1)
1186
+
1187
+ final_output = final_output[:, : self.config.inference_length]
1188
+
1189
+ elif auto_regressive_strategy == "from_short_to_long":
1190
+ # From short to long
1191
+ # in validate_args, it has been sorted, and check the valid config
1192
+ multi_forecast_head_list = sorted(self.config.multi_forecast_head_list)
1193
+ multi_forecast_head_dict = {}
1194
+ for idx, head_pred_len in enumerate(self.config.multi_forecast_head_list):
1195
+ if idx == len(multi_forecast_head_list) - 1:
1196
+ ar_step = math.ceil(self.config.inference_length / head_pred_len)
1197
+ else:
1198
+ ar_step = min(
1199
+ self.config.autoregressive_step_list[idx],
1200
+ self.config.multi_forecast_head_list[idx + 1]
1201
+ // self.config.multi_forecast_head_list[idx],
1202
+ )
1203
+ # ar_step = multi_forecast_head_list[idx + 1] // multi_forecast_head_list[idx]
1204
+
1205
+ multi_forecast_head_dict[head_pred_len] = ar_step
1206
+
1207
+ # the core idea of strategy [from_short_to_long]
1208
+ mixed_pred = patchmoe_forecast
1209
+ output_list = []
1210
+ cur_pred = None
1211
+ cur_pred_len = 0
1212
+
1213
+ # from the first(shortest) as begining
1214
+ for idx, head_pred_len in enumerate(self.config.multi_forecast_head_list):
1215
+ # assert cur_pred_len <= head_pred_len, \
1216
+ # "Accumulated prediction length exceeds the prediction length of current forecast head"
1217
+
1218
+ ar_step = multi_forecast_head_dict[head_pred_len]
1219
+ if ar_step == 0:
1220
+ # Ignore the current forecast head
1221
+ continue
1222
+
1223
+ # Add current head's first auto-regressive step of prediction
1224
+ head_pred = mixed_pred[:, :head_pred_len] # [single]
1225
+ output_list.append(head_pred[:, cur_pred_len:])
1226
+ cur_pred = torch.cat(output_list, dim=1)
1227
+ cur_pred_len = cur_pred.shape[1]
1228
+ if cur_pred_len >= self.config.inference_length:
1229
+ break
1230
+
1231
+ # Do auto-regressive of the rest of the steps
1232
+ for _ in range(1, ar_step + 1):
1233
+ # one-step model prediction
1234
+ cur_input = torch.cat([input, cur_pred], dim=1)[
1235
+ :, -self.seq_length :
1236
+ ].contiguous()
1237
+ cur_input_mask = torch.cat(
1238
+ [
1239
+ input_mask,
1240
+ torch.ones(
1241
+ cur_pred.shape, dtype=input_mask.dtype, device=input_mask.device
1242
+ ),
1243
+ ],
1244
+ dim=1,
1245
+ )[
1246
+ :, -self.seq_length :
1247
+ ].contiguous() # 0:mask, 1:unmask
1248
+
1249
+ patchmoe_forecast = self._inference_step(
1250
+ input=cur_input,
1251
+ input_mask=cur_input_mask,
1252
+ rotary_pos_emb=rotary_pos_emb,
1253
+ )
1254
+
1255
+ head_pred = patchmoe_forecast[:, :head_pred_len]
1256
+ output_list.append(head_pred)
1257
+ cur_pred = torch.cat(output_list, dim=1)
1258
+ cur_pred_len = cur_pred.shape[1]
1259
+ if cur_pred_len >= self.config.inference_length:
1260
+ break
1261
+
1262
+ if cur_pred_len >= self.config.inference_length:
1263
+ break
1264
+
1265
+ final_output = cur_pred[
1266
+ :, : self.config.inference_length
1267
+ ] # [batch_size, inference_len]
1268
+
1269
+ assert final_output.shape[1] == self.config.inference_length
1270
+ return final_output
1271
+
1272
+
1273
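For the from_long_to_short strategy above, each autoregressive step picks the longest head that still fits the remaining horizon, falling back to the shortest one. A small trace of that selection logic in isolation, using the head lengths from config.json and a hypothetical 720-step horizon:

    heads = sorted([24, 96, 336], reverse=True)   # multi_forecast_head_list, longest first
    inference_length = 720                        # hypothetical requested horizon
    produced = 336                                # length of the first one-step forecast (pred_length)
    while produced < inference_length:
        remain = inference_length - produced
        head = next((h for h in heads if h <= remain), heads[-1])
        print(remain, "->", head)
        produced += head
    # prints 384 -> 336, then 48 -> 24, then 24 -> 24; the output is finally truncated to inference_length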
+ class PatchMoEForPrediction(PatchMoEPreTrainedModel, PatchMoEGenerationMixin):
1274
+ def __init__(self, config: PatchMoeConfig):
1275
+ super().__init__(config)
1276
+ self.config = config
1277
+ self.model = PatchMoEModel(self.config)
1278
+ self.post_init()
1279
+
1280
+ def forward(
1281
+ self,
1282
+ input_ids: torch.FloatTensor,
1283
+ attention_mask: Optional[torch.Tensor] = None,
1284
+ labels: Optional[torch.FloatTensor] = None,
1285
+ return_dict: Optional[bool] = False,
1286
+ max_output_length: Optional[int] = None,
1287
+ revin: Optional[bool] = False,
1288
+ ):
1289
+ self.model.config.inference_length = max_output_length
1290
+ outputs = self.model(input=input_ids, revin=revin)
1291
+
1292
+ loss = None
1293
+ logits = outputs
1294
+
1295
+ if labels is not None:
1296
+ loss_fn = nn.MSELoss()
1297
+ loss = loss_fn(logits, labels)
1298
+
1299
+ if not return_dict:
1300
+ output = (logits,)
1301
+ return ((loss,) + output) if loss is not None else output
1302
+
1303
+ return logits
1304
+
1305
+ def prepare_inputs_for_generation(
1306
+ self,
1307
+ input_ids,
1308
+ past_key_values=None,
1309
+ attention_mask=None,
1310
+ inputs_embeds=None,
1311
+ revin=False,
1312
+ **kwargs,
1313
+ ):
1314
+ """
1315
+ Prepare model inputs for autoregressive generation.
1316
+ """
1317
+
1318
+ model_inputs = {"input_ids": input_ids}
1319
+
1320
+ model_inputs.update(
1321
+ {
1322
+ "revin": revin,
1323
+ }
1324
+ )
1325
+
1326
+ return model_inputs
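Tying the pieces together, loading and forecasting with this repository would presumably look like the following. This is a hedged sketch only: the repo id is a placeholder, and the mapping of max_new_tokens to the forecast horizon relies on the generation mixin defined in the next file.

    import torch
    from transformers import AutoModelForCausalLM

    model = AutoModelForCausalLM.from_pretrained(
        "<namespace>/patchmoe",        # placeholder repo id
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
    )
    context = torch.randn(1, 2880)     # [batch_size, seq_len]; shorter histories are left-padded internally
    forecast = model.generate(context, max_new_tokens=336, revin=True)
    print(forecast.shape)              # expected [1, 336, 1]: [batch, pred_len, channels]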
ts_generation_mixin.py ADDED
@@ -0,0 +1,172 @@
1
+ """
2
+ Time Series Generation Mixin for PatchMoE
3
+
4
+ This module provides generation capabilities specifically designed for time series
5
+ forecasting tasks. It extends the standard Transformers GenerationMixin to handle
6
+ time series data with proper input/output reshaping and autoregressive generation.
7
+ """
8
+
9
+ from typing import List, Optional, Union, Callable
10
+ import torch
11
+ from transformers import GenerationMixin, LogitsProcessorList, StoppingCriteriaList
12
+ from transformers.generation.utils import (
13
+ GenerateNonBeamOutput,
14
+ GenerationConfig,
15
+ GenerateOutput,
16
+ )
17
+
18
+
19
+ class PatchMoEGenerationMixin(GenerationMixin):
20
+ """
21
+ Generation mixin class for PatchMoE time series forecasting.
22
+
23
+ This class extends the standard Transformers GenerationMixin to provide
24
+ specialized generation capabilities for time series data, including proper
25
+ handling of multi-channel inputs and autoregressive forecasting.
26
+ """
27
+
28
+ @torch.no_grad()
29
+ def generate(
30
+ self,
31
+ inputs: Optional[torch.Tensor] = None,
32
+ generation_config: Optional[GenerationConfig] = None,
33
+ logits_processor: Optional[LogitsProcessorList] = None,
34
+ stopping_criteria: Optional[StoppingCriteriaList] = None,
35
+ prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None,
36
+ synced_gpus: Optional[bool] = None,
37
+ assistant_model: Optional["PreTrainedModel"] = None,
38
+ streamer: Optional["BaseStreamer"] = None,
39
+ negative_prompt_ids: Optional[torch.Tensor] = None,
40
+ negative_prompt_attention_mask: Optional[torch.Tensor] = None,
41
+ revin: Optional[bool] = True,
42
+ num_samples: Optional[int] = 1,
43
+ **kwargs,
44
+ ) -> Union[GenerateOutput, torch.LongTensor]:
45
+ """
46
+ Generate time series forecasts using the PatchMoE model.
47
+
48
+ This method handles the generation of time series forecasts with proper
49
+ input preprocessing and output postprocessing for multi-channel data.
50
+
51
+ Args:
52
+ inputs (torch.Tensor): Input time series data of shape:
53
+ - [batch_size, seq_len] for single-channel
54
+ - [batch_size, seq_len, channels] for multi-channel
55
+ generation_config (GenerationConfig, optional): Generation configuration
56
+ logits_processor (LogitsProcessorList, optional): Logits processors
57
+ stopping_criteria (StoppingCriteriaList, optional): Stopping criteria
58
+ prefix_allowed_tokens_fn (Callable, optional): Prefix token function
59
+ synced_gpus (bool, optional): Whether to sync GPUs
60
+ assistant_model (PreTrainedModel, optional): Assistant model
61
+ streamer (BaseStreamer, optional): Output streamer
62
+ negative_prompt_ids (torch.Tensor, optional): Negative prompt IDs
63
+ negative_prompt_attention_mask (torch.Tensor, optional): Negative attention mask
64
+ revin (bool, optional): Whether to apply RevIN normalization
65
+ num_samples (int, optional): Number of samples to generate
66
+ **kwargs: Additional keyword arguments
67
+
68
+ Returns:
69
+ torch.Tensor: Generated forecasts of shape [batch_size, pred_len, channels]
70
+
71
+ Raises:
72
+ ValueError: If input shape is not supported
73
+ """
74
+ # Extract input dimensions
75
+ batch_size = inputs.shape[0]
76
+ length = inputs.shape[1]
77
+ channel = 1
78
+
79
+ # Handle multi-channel inputs
80
+ if len(inputs.shape) == 3:
81
+ channel = inputs.shape[2]
82
+ # Reshape to [batch_size * channels, seq_len] for processing
83
+ inputs = inputs.reshape(batch_size * channel, length)
84
+ elif len(inputs.shape) > 3:
85
+ raise ValueError("Input shape must be [batch, seq_len, channel] or [batch, seq_len]")
86
+
87
+ # Call parent generation method
88
+ outputs = super().generate(
89
+ inputs=inputs,
90
+ generation_config=generation_config,
91
+ logits_processor=logits_processor,
92
+ stopping_criteria=stopping_criteria,
93
+ prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
94
+ synced_gpus=synced_gpus,
95
+ assistant_model=assistant_model,
96
+ streamer=streamer,
97
+ negative_prompt_ids=negative_prompt_ids,
98
+ negative_prompt_attention_mask=negative_prompt_attention_mask,
99
+ revin=revin,
100
+ **kwargs,
101
+ )
102
+
103
+ # Reshape outputs back to [batch_size, pred_len, channels]
104
+ pred_len = outputs.shape[1]
105
+ outputs = outputs.reshape(batch_size, channel, pred_len)
106
+ outputs = outputs.transpose(1, 2).contiguous()
107
+ return outputs
108
+
109
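With the channels moved next to the batch dimension before flattening (note the transpose above, so each row is a single channel's history), the reshape round trip used by generate looks like this standalone sketch with arbitrary small sizes:

    import torch

    batch, seq_len, channels, pred_len = 2, 8, 3, 4
    x = torch.arange(batch * seq_len * channels, dtype=torch.float32).reshape(batch, seq_len, channels)
    flat = x.transpose(1, 2).reshape(batch * channels, seq_len)        # row b*channels + c == x[b, :, c]
    out = flat[:, -pred_len:]                                          # stand-in for per-series forecasts
    restored = out.reshape(batch, channels, pred_len).transpose(1, 2)  # [batch, pred_len, channels]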
+ def _greedy_search(
110
+ self,
111
+ input_ids: torch.Tensor,
112
+ logits_processor: Optional[LogitsProcessorList] = None,
113
+ stopping_criteria: Optional[StoppingCriteriaList] = None,
114
+ max_length: Optional[int] = None,
115
+ pad_token_id: Optional[int] = None,
116
+ eos_token_id: Optional[Union[int, List[int]]] = None,
117
+ output_attentions: Optional[bool] = None,
118
+ output_hidden_states: Optional[bool] = None,
119
+ output_scores: Optional[bool] = None,
120
+ output_logits: Optional[bool] = None,
121
+ return_dict_in_generate: Optional[bool] = None,
122
+ synced_gpus: bool = False,
123
+ streamer: Optional["BaseStreamer"] = None,
124
+ **model_kwargs,
125
+ ) -> Union[GenerateNonBeamOutput, torch.Tensor]:
126
+ """
127
+ Perform greedy search generation for time series forecasting.
128
+
129
+ This method implements greedy decoding specifically for time series data,
130
+ where the model generates forecasts autoregressively.
131
+
132
+ Args:
133
+ input_ids (torch.Tensor): Input time series data
134
+ logits_processor (LogitsProcessorList, optional): Logits processors
135
+ stopping_criteria (StoppingCriteriaList, optional): Stopping criteria
136
+ max_length (int, optional): Maximum generation length
137
+ pad_token_id (int, optional): Padding token ID (not used for time series)
138
+ eos_token_id (int or List[int], optional): End-of-sequence token ID
139
+ output_attentions (bool, optional): Whether to output attentions
140
+ output_hidden_states (bool, optional): Whether to output hidden states
141
+ output_scores (bool, optional): Whether to output scores
142
+ output_logits (bool, optional): Whether to output logits
143
+ return_dict_in_generate (bool, optional): Whether to return dict
144
+ synced_gpus (bool): Whether to sync GPUs
145
+ streamer (BaseStreamer, optional): Output streamer
146
+ **model_kwargs: Additional model arguments
147
+
148
+ Returns:
149
+ torch.Tensor: Generated time series forecasts
150
+ """
151
+ # Move inputs to model device
152
+ input_ids = input_ids.to(self.device)
153
+ batch_size, cur_len = input_ids.shape
154
+
155
+ # Initialize processors and criteria if not provided
156
+ logits_processor = (
157
+ logits_processor if logits_processor is not None else LogitsProcessorList()
158
+ )
159
+ stopping_criteria = (
160
+ stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
161
+ )
162
+
163
+ # Prepare model inputs for generation
164
+ model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
165
+
166
+ # Generate forecasts with specified output length
167
+ outputs = self(
168
+ **model_inputs,
169
+ return_dict=True,
170
+ max_output_length=stopping_criteria.max_length - cur_len,
171
+ )
172
+ return outputs