diff --git "a/.cache/MatMulNBits_2_0_meta.json" "b/.cache/MatMulNBits_2_0_meta.json"
new file mode 100644--- /dev/null
+++ "b/.cache/MatMulNBits_2_0_meta.json"
@@ -0,0 +1,36155 @@
+{
+  "dd_meta_major_version": 1,
+  "dd_meta_minor_version": 4,
+  "state_table_updates": [
+    {
+      "state_table_idx": 0,
+      "update_func": 1,
+      "update_arg": 1
+    }
+  ],
+  "op_list": [
+    {
+      "name": "MatMulNBits_2_0",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.0/input_layernorm/output_0.out5_4_0"
+      ],
+      "const_args": [
+        "model.layers.0.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.0.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.0.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.0.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.0/attn/qk_proj/MatMulNBits/output_0.out5_4_0"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "5120"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.0.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.0/input_layernorm/output_0.out5_4_0"
+      ],
+      "const_args": [
+        "model.layers.0.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.0.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.0.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.0.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.0.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "1024"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "3",
+            "1"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.0/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.0/attn/qk_proj/MatMulNBits/output_0.out5_4_0",
+        "past_key_values.0.key",
+        "past_key_values.0.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.0/attn/GroupQueryAttention/output_0.out2_0",
+        "present.0.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "8"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.0883883461356163"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "8",
+            "32",
+            "1",
+            "4096",
+            "128"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "0",
+            "0",
+            "2",
+            "0",
+            "1",
+            "1",
+            "6",
+            "0",
+            "2",
+            "0"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "256",
+            "6",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.0.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.0/attn/GroupQueryAttention/output_0.out2_0"
+      ],
+      "const_args": [
+        "model.layers.0.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.0.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.0.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.0.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.0/attn/o_proj/MatMulNBits/output_0.out5_4_1"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_0",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/embed_tokens/Gather/output_0.out4_0",
+        "/model/layers.0/attn/o_proj/MatMulNBits/output_0.out5_4_1"
+      ],
+      "const_args": [
+        "model.layers.0.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.0/post_attention_layernorm/output_3.out4_0",
+        "/model/layers.0/post_attention_layernorm/output_0.out4_0"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_0",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.0/post_attention_layernorm/output_0.out4_0"
+      ],
+      "const_args": [
+        "model.layers.0.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.0.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.0.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.0.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.0.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.0.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.0.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.0.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.0/mlp/Mul/output_0.out3_0"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "4096",
+            "14336"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.0.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.0/mlp/Mul/output_0.out3_0"
+      ],
+      "const_args": [
+        "model.layers.0.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.0.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.0.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.0.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.0/mlp/down_proj/MatMulNBits/output_0.out5_4_2"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "14336"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_1",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.0/post_attention_layernorm/output_3.out4_0",
+        "/model/layers.0/mlp/down_proj/MatMulNBits/output_0.out5_4_2"
+      ],
+      "const_args": [
+        "model.layers.1.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.1/input_layernorm/output_3.out4_1",
+        "/model/layers.1/input_layernorm/output_0.out4_1"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_1",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.1/input_layernorm/output_0.out4_1"
+      ],
+      "const_args": [
+        "model.layers.1.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.1.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.1.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.1.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.1/attn/qk_proj/MatMulNBits/output_0.out5_4_3"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "5120"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.1.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.1/input_layernorm/output_0.out4_1"
+      ],
+      "const_args": [
+        "model.layers.1.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.1.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.1.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.1.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.1.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "1024"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "7",
+            "3"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.1/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.1/attn/qk_proj/MatMulNBits/output_0.out5_4_3",
+        "past_key_values.1.key",
+        "past_key_values.1.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.1/attn/GroupQueryAttention/output_0.out2_1",
+        "present.1.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "8"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.0883883461356163"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "8",
+            "32",
+            "1",
+            "4096",
+            "128"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "4",
+            "2",
+            "2",
+            "0",
+            "5",
+            "3",
+            "6",
+            "0",
+            "6",
+            "2"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "256",
+            "6",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.1.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.1/attn/GroupQueryAttention/output_0.out2_1"
+      ],
+      "const_args": [
+        "model.layers.1.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.1.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.1.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.1.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.1/attn/o_proj/MatMulNBits/output_0.out5_4_4"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_2",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.1/input_layernorm/output_3.out4_1",
+        "/model/layers.1/attn/o_proj/MatMulNBits/output_0.out5_4_4"
+      ],
+      "const_args": [
+        "model.layers.1.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.1/post_attention_layernorm/output_3.out4_2",
+        "/model/layers.1/post_attention_layernorm/output_0.out4_2"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_1",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.1/post_attention_layernorm/output_0.out4_2"
+      ],
+      "const_args": [
+        "model.layers.1.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.1.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.1.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.1.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.1.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.1.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.1.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.1.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.1/mlp/Mul/output_0.out3_1"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "4096",
+            "14336"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.1.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.1/mlp/Mul/output_0.out3_1"
+      ],
+      "const_args": [
+        "model.layers.1.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.1.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.1.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.1.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.1/mlp/down_proj/MatMulNBits/output_0.out5_4_5"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "14336"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_3",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.1/post_attention_layernorm/output_3.out4_2",
+        "/model/layers.1/mlp/down_proj/MatMulNBits/output_0.out5_4_5"
+      ],
+      "const_args": [
+        "model.layers.2.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.2/input_layernorm/output_3.out4_3",
+        "/model/layers.2/input_layernorm/output_0.out4_3"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_2",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.2/input_layernorm/output_0.out4_3"
+      ],
+      "const_args": [
+        "model.layers.2.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.2.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.2.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.2.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.2/attn/qk_proj/MatMulNBits/output_0.out5_4_6"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "5120"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.2.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.2/input_layernorm/output_0.out4_3"
+      ],
+      "const_args": [
+        "model.layers.2.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.2.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.2.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.2.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.2.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "1024"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "11",
+            "5"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.2/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.2/attn/qk_proj/MatMulNBits/output_0.out5_4_6",
+        "past_key_values.2.key",
+        "past_key_values.2.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.2/attn/GroupQueryAttention/output_0.out2_2",
+        "present.2.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "8"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.0883883461356163"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "8",
+            "32",
+            "1",
+            "4096",
+            "128"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "8",
+            "4",
+            "2",
+            "0",
+            "9",
+            "5",
+            "6",
+            "0",
+            "10",
+            "4"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "256",
+            "6",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.2.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.2/attn/GroupQueryAttention/output_0.out2_2"
+      ],
+      "const_args": [
+        "model.layers.2.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.2.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.2.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.2.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.2/attn/o_proj/MatMulNBits/output_0.out5_4_7"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_4",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.2/input_layernorm/output_3.out4_3",
+        "/model/layers.2/attn/o_proj/MatMulNBits/output_0.out5_4_7"
+      ],
+      "const_args": [
+        "model.layers.2.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.2/post_attention_layernorm/output_3.out4_4",
+        "/model/layers.2/post_attention_layernorm/output_0.out4_4"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_2",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.2/post_attention_layernorm/output_0.out4_4"
+      ],
+      "const_args": [
+        "model.layers.2.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.2.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.2.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.2.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.2.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.2.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.2.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.2.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.2/mlp/Mul/output_0.out3_2"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "4096",
+            "14336"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.2.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.2/mlp/Mul/output_0.out3_2"
+      ],
+      "const_args": [
+        "model.layers.2.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.2.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.2.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.2.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.2/mlp/down_proj/MatMulNBits/output_0.out5_4_8"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "14336"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_5",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.2/post_attention_layernorm/output_3.out4_4",
+        "/model/layers.2/mlp/down_proj/MatMulNBits/output_0.out5_4_8"
+      ],
+      "const_args": [
+        "model.layers.3.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.3/input_layernorm/output_3.out4_5",
+        "/model/layers.3/input_layernorm/output_0.out4_5"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_3",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.3/input_layernorm/output_0.out4_5"
+      ],
+      "const_args": [
+        "model.layers.3.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.3.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.3.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.3.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.3/attn/qk_proj/MatMulNBits/output_0.out5_4_9"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "5120"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.3.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.3/input_layernorm/output_0.out4_5"
+      ],
+      "const_args": [
+        "model.layers.3.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.3.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.3.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.3.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.3.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "1024"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "15",
+            "7"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.3/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.3/attn/qk_proj/MatMulNBits/output_0.out5_4_9",
+        "past_key_values.3.key",
+        "past_key_values.3.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.3/attn/GroupQueryAttention/output_0.out2_3",
+        "present.3.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "8"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.0883883461356163"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "8",
+            "32",
+            "1",
+            "4096",
+            "128"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "12",
+            "6",
+            "2",
+            "0",
+            "13",
+            "7",
+            "6",
+            "0",
+            "14",
+            "6"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "256",
+            "6",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.3.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.3/attn/GroupQueryAttention/output_0.out2_3"
+      ],
+      "const_args": [
+        "model.layers.3.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.3.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.3.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.3.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.3/attn/o_proj/MatMulNBits/output_0.out5_4_10"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_6",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.3/input_layernorm/output_3.out4_5",
+        "/model/layers.3/attn/o_proj/MatMulNBits/output_0.out5_4_10"
+      ],
+      "const_args": [
+        "model.layers.3.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.3/post_attention_layernorm/output_3.out4_6",
+        "/model/layers.3/post_attention_layernorm/output_0.out4_6"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_3",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.3/post_attention_layernorm/output_0.out4_6"
+      ],
+      "const_args": [
+        "model.layers.3.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.3.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.3.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.3.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.3.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.3.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.3.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.3.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.3/mlp/Mul/output_0.out3_3"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "4096",
+            "14336"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.3.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.3/mlp/Mul/output_0.out3_3"
+      ],
+      "const_args": [
+        "model.layers.3.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.3.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.3.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.3.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.3/mlp/down_proj/MatMulNBits/output_0.out5_4_11"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "14336"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_7",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.3/post_attention_layernorm/output_3.out4_6",
+        "/model/layers.3/mlp/down_proj/MatMulNBits/output_0.out5_4_11"
+      ],
+      "const_args": [
+        "model.layers.4.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.4/input_layernorm/output_3.out4_7",
+        "/model/layers.4/input_layernorm/output_0.out4_7"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_4",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.4/input_layernorm/output_0.out4_7"
+      ],
+      "const_args": [
+        "model.layers.4.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.4.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.4.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.4.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.4/attn/qk_proj/MatMulNBits/output_0.out5_4_12"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "5120"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.4.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.4/input_layernorm/output_0.out4_7"
+      ],
+      "const_args": [
+        "model.layers.4.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.4.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.4.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.4.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.4.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "1024"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "19",
+            "9"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.4/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.4/attn/qk_proj/MatMulNBits/output_0.out5_4_12",
+        "past_key_values.4.key",
+        "past_key_values.4.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.4/attn/GroupQueryAttention/output_0.out2_4",
+        "present.4.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "8"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.0883883461356163"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "8",
+            "32",
+            "1",
+            "4096",
+            "128"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "16",
+            "8",
+            "2",
+            "0",
+            "17",
+            "9",
+            "6",
+            "0",
+            "18",
+            "8"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "256",
+            "6",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.4.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.4/attn/GroupQueryAttention/output_0.out2_4"
+      ],
+      "const_args": [
+        "model.layers.4.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.4.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.4.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.4.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.4/attn/o_proj/MatMulNBits/output_0.out5_4_13"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_8",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.4/input_layernorm/output_3.out4_7",
+        "/model/layers.4/attn/o_proj/MatMulNBits/output_0.out5_4_13"
+      ],
+      "const_args": [
+        "model.layers.4.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.4/post_attention_layernorm/output_3.out4_8",
+        "/model/layers.4/post_attention_layernorm/output_0.out4_8"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_4",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.4/post_attention_layernorm/output_0.out4_8"
+      ],
+      "const_args": [
+        "model.layers.4.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.4.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.4.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.4.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.4.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.4.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.4.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.4.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.4/mlp/Mul/output_0.out3_4"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "4096",
+            "14336"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.4.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.4/mlp/Mul/output_0.out3_4"
+      ],
+      "const_args": [
+        "model.layers.4.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.4.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.4.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.4.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.4/mlp/down_proj/MatMulNBits/output_0.out5_4_14"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "14336"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_9",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.4/post_attention_layernorm/output_3.out4_8",
+        "/model/layers.4/mlp/down_proj/MatMulNBits/output_0.out5_4_14"
+      ],
+      "const_args": [
+        "model.layers.5.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.5/input_layernorm/output_3.out4_9",
+        "/model/layers.5/input_layernorm/output_0.out4_9"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_5",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.5/input_layernorm/output_0.out4_9"
+      ],
+      "const_args": [
+        "model.layers.5.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.5.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.5.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.5.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.5/attn/qk_proj/MatMulNBits/output_0.out5_4_15"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "5120"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.5.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.5/input_layernorm/output_0.out4_9"
+      ],
+      "const_args": [
+        "model.layers.5.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.5.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.5.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.5.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.5.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "1024"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "23",
+            "11"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.5/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.5/attn/qk_proj/MatMulNBits/output_0.out5_4_15",
+        "past_key_values.5.key",
+        "past_key_values.5.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.5/attn/GroupQueryAttention/output_0.out2_5",
+        "present.5.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "8"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.0883883461356163"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "8",
+            "32",
+            "1",
+            "4096",
+            "128"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "20",
+            "10",
+            "2",
+            "0",
+            "21",
+            "11",
+            "6",
+            "0",
+            "22",
+            "10"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "256",
+            "6",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.5.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.5/attn/GroupQueryAttention/output_0.out2_5"
+      ],
+      "const_args": [
+        "model.layers.5.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.5.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.5.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.5.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.5/attn/o_proj/MatMulNBits/output_0.out5_4_16"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_10",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.5/input_layernorm/output_3.out4_9",
+        "/model/layers.5/attn/o_proj/MatMulNBits/output_0.out5_4_16"
+      ],
+      "const_args": [
+        "model.layers.5.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.5/post_attention_layernorm/output_3.out4_10",
+        "/model/layers.5/post_attention_layernorm/output_0.out4_10"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_5",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.5/post_attention_layernorm/output_0.out4_10"
+      ],
+      "const_args": [
+        "model.layers.5.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.5.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.5.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.5.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.5.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.5.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.5.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.5.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.5/mlp/Mul/output_0.out3_5"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "4096",
+            "14336"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.5.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.5/mlp/Mul/output_0.out3_5"
+      ],
+      "const_args": [
+        "model.layers.5.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.5.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.5.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.5.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.5/mlp/down_proj/MatMulNBits/output_0.out5_4_17"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "14336"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_11",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.5/post_attention_layernorm/output_3.out4_10",
+        "/model/layers.5/mlp/down_proj/MatMulNBits/output_0.out5_4_17"
+      ],
+      "const_args": [
+        "model.layers.6.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.6/input_layernorm/output_3.out4_11",
+        "/model/layers.6/input_layernorm/output_0.out4_11"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_6",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.6/input_layernorm/output_0.out4_11"
+      ],
+      "const_args": [
+        "model.layers.6.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.6.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.6.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.6.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.6/attn/qk_proj/MatMulNBits/output_0.out5_4_18"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "5120"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.6.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.6/input_layernorm/output_0.out4_11"
+      ],
+      "const_args": [
+        "model.layers.6.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.6.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.6.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.6.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.6.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "1024"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "27",
+            "13"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.6/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.6/attn/qk_proj/MatMulNBits/output_0.out5_4_18",
+        "past_key_values.6.key",
+        "past_key_values.6.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.6/attn/GroupQueryAttention/output_0.out2_6",
+        "present.6.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "8"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.0883883461356163"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "8",
+            "32",
+            "1",
+            "4096",
+            "128"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "24",
+            "12",
+            "2",
+            "0",
+            "25",
+            "13",
+            "6",
+            "0",
+            "26",
+            "12"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "256",
+            "6",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.6.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.6/attn/GroupQueryAttention/output_0.out2_6"
+      ],
+      "const_args": [
+        "model.layers.6.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.6.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.6.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.6.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.6/attn/o_proj/MatMulNBits/output_0.out5_4_19"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_12",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.6/input_layernorm/output_3.out4_11",
+        "/model/layers.6/attn/o_proj/MatMulNBits/output_0.out5_4_19"
+      ],
+      "const_args": [
+        "model.layers.6.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.6/post_attention_layernorm/output_3.out4_12",
+        "/model/layers.6/post_attention_layernorm/output_0.out4_12"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_6",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.6/post_attention_layernorm/output_0.out4_12"
+      ],
+      "const_args": [
+        "model.layers.6.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.6.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.6.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.6.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.6.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.6.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.6.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.6.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.6/mlp/Mul/output_0.out3_6"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "4096",
+            "14336"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.6.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.6/mlp/Mul/output_0.out3_6"
+      ],
+      "const_args": [
+        "model.layers.6.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.6.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.6.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.6.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.6/mlp/down_proj/MatMulNBits/output_0.out5_4_20"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "14336"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_13",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.6/post_attention_layernorm/output_3.out4_12",
+        "/model/layers.6/mlp/down_proj/MatMulNBits/output_0.out5_4_20"
+      ],
+      "const_args": [
+        "model.layers.7.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.7/input_layernorm/output_3.out4_13",
+        "/model/layers.7/input_layernorm/output_0.out4_13"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_7",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.7/input_layernorm/output_0.out4_13"
+      ],
+      "const_args": [
+        "model.layers.7.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.7.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.7.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.7.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.7/attn/qk_proj/MatMulNBits/output_0.out5_4_21"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "5120"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.7.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.7/input_layernorm/output_0.out4_13"
+      ],
+      "const_args": [
+        "model.layers.7.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.7.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.7.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.7.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.7.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "1024"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "31",
+            "15"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.7/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.7/attn/qk_proj/MatMulNBits/output_0.out5_4_21",
+        "past_key_values.7.key",
+        "past_key_values.7.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.7/attn/GroupQueryAttention/output_0.out2_7",
+        "present.7.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "8"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.0883883461356163"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "8",
+            "32",
+            "1",
+            "4096",
+            "128"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "28",
+            "14",
+            "2",
+            "0",
+            "29",
+            "15",
+            "6",
+            "0",
+            "30",
+            "14"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "256",
+            "6",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.7.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.7/attn/GroupQueryAttention/output_0.out2_7"
+      ],
+      "const_args": [
+        "model.layers.7.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.7.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.7.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.7.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.7/attn/o_proj/MatMulNBits/output_0.out5_4_22"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_14",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.7/input_layernorm/output_3.out4_13",
+        "/model/layers.7/attn/o_proj/MatMulNBits/output_0.out5_4_22"
+      ],
+      "const_args": [
+        "model.layers.7.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.7/post_attention_layernorm/output_3.out4_14",
+        "/model/layers.7/post_attention_layernorm/output_0.out4_14"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_7",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.7/post_attention_layernorm/output_0.out4_14"
+      ],
+      "const_args": [
+        "model.layers.7.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.7.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.7.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.7.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.7.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.7.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.7.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.7.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.7/mlp/Mul/output_0.out3_7"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "4096",
+            "14336"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.7.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.7/mlp/Mul/output_0.out3_7"
+      ],
+      "const_args": [
+        "model.layers.7.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.7.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.7.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.7.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.7/mlp/down_proj/MatMulNBits/output_0.out5_4_23"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "14336"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_15",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.7/post_attention_layernorm/output_3.out4_14",
+        "/model/layers.7/mlp/down_proj/MatMulNBits/output_0.out5_4_23"
+      ],
+      "const_args": [
+        "model.layers.8.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.8/input_layernorm/output_3.out4_15",
+        "/model/layers.8/input_layernorm/output_0.out4_15"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_8",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.8/input_layernorm/output_0.out4_15"
+      ],
+      "const_args": [
+        "model.layers.8.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.8.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.8.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.8.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.8/attn/qk_proj/MatMulNBits/output_0.out5_4_24"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "5120"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.8.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.8/input_layernorm/output_0.out4_15"
+      ],
+      "const_args": [
+        "model.layers.8.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.8.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.8.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.8.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.8.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "1024"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "35",
+            "17"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.8/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.8/attn/qk_proj/MatMulNBits/output_0.out5_4_24",
+        "past_key_values.8.key",
+        "past_key_values.8.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.8/attn/GroupQueryAttention/output_0.out2_8",
+        "present.8.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "8"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.0883883461356163"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "8",
+            "32",
+            "1",
+            "4096",
+            "128"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "32",
+            "16",
+            "2",
+            "0",
+            "33",
+            "17",
+            "6",
+            "0",
+            "34",
+            "16"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "256",
+            "6",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.8.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.8/attn/GroupQueryAttention/output_0.out2_8"
+      ],
+      "const_args": [
+        "model.layers.8.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.8.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.8.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.8.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.8/attn/o_proj/MatMulNBits/output_0.out5_4_25"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_16",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.8/input_layernorm/output_3.out4_15",
+        "/model/layers.8/attn/o_proj/MatMulNBits/output_0.out5_4_25"
+      ],
+      "const_args": [
+        "model.layers.8.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.8/post_attention_layernorm/output_3.out4_16",
+        "/model/layers.8/post_attention_layernorm/output_0.out4_16"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_8",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.8/post_attention_layernorm/output_0.out4_16"
+      ],
+      "const_args": [
+        "model.layers.8.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.8.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.8.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.8.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.8.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.8.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.8.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.8.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.8/mlp/Mul/output_0.out3_8"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "4096",
+            "14336"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.8.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.8/mlp/Mul/output_0.out3_8"
+      ],
+      "const_args": [
+        "model.layers.8.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.8.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.8.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.8.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.8/mlp/down_proj/MatMulNBits/output_0.out5_4_26"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "14336"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_17",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.8/post_attention_layernorm/output_3.out4_16",
+        "/model/layers.8/mlp/down_proj/MatMulNBits/output_0.out5_4_26"
+      ],
+      "const_args": [
+        "model.layers.9.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.9/input_layernorm/output_3.out4_17",
+        "/model/layers.9/input_layernorm/output_0.out4_17"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_9",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.9/input_layernorm/output_0.out4_17"
+      ],
+      "const_args": [
+        "model.layers.9.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.9.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.9.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.9.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.9/attn/qk_proj/MatMulNBits/output_0.out5_4_27"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "5120"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.9.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.9/input_layernorm/output_0.out4_17"
+      ],
+      "const_args": [
+        "model.layers.9.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.9.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.9.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.9.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.9.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "1024"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "39",
+            "19"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.9/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.9/attn/qk_proj/MatMulNBits/output_0.out5_4_27",
+        "past_key_values.9.key",
+        "past_key_values.9.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.9/attn/GroupQueryAttention/output_0.out2_9",
+        "present.9.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "8"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.0883883461356163"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "8",
+            "32",
+            "1",
+            "4096",
+            "128"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "36",
+            "18",
+            "2",
+            "0",
+            "37",
+            "19",
+            "6",
+            "0",
+            "38",
+            "18"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "256",
+            "6",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.9.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.9/attn/GroupQueryAttention/output_0.out2_9"
+      ],
+      "const_args": [
+        "model.layers.9.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.9.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.9.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.9.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.9/attn/o_proj/MatMulNBits/output_0.out5_4_28"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_18",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.9/input_layernorm/output_3.out4_17",
+        "/model/layers.9/attn/o_proj/MatMulNBits/output_0.out5_4_28"
+      ],
+      "const_args": [
+        "model.layers.9.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.9/post_attention_layernorm/output_3.out4_18",
+        "/model/layers.9/post_attention_layernorm/output_0.out4_18"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_9",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.9/post_attention_layernorm/output_0.out4_18"
+      ],
+      "const_args": [
+        "model.layers.9.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.9.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.9.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.9.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.9.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.9.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.9.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.9.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.9/mlp/Mul/output_0.out3_9"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "4096",
+            "14336"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.9.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.9/mlp/Mul/output_0.out3_9"
+      ],
+      "const_args": [
+        "model.layers.9.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.9.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.9.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.9.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.9/mlp/down_proj/MatMulNBits/output_0.out5_4_29"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "14336"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_19",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.9/post_attention_layernorm/output_3.out4_18",
+        "/model/layers.9/mlp/down_proj/MatMulNBits/output_0.out5_4_29"
+      ],
+      "const_args": [
+        "model.layers.10.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.10/input_layernorm/output_3.out4_19",
+        "/model/layers.10/input_layernorm/output_0.out4_19"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_10",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.10/input_layernorm/output_0.out4_19"
+      ],
+      "const_args": [
+        "model.layers.10.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.10.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.10.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.10.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.10/attn/qk_proj/MatMulNBits/output_0.out5_4_30"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "5120"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.10.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.10/input_layernorm/output_0.out4_19"
+      ],
+      "const_args": [
+        "model.layers.10.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.10.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.10.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.10.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.10.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "1024"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "43",
+            "21"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.10/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.10/attn/qk_proj/MatMulNBits/output_0.out5_4_30",
+        "past_key_values.10.key",
+        "past_key_values.10.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.10/attn/GroupQueryAttention/output_0.out2_10",
+        "present.10.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "8"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.0883883461356163"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "8",
+            "32",
+            "1",
+            "4096",
+            "128"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "40",
+            "20",
+            "2",
+            "0",
+            "41",
+            "21",
+            "6",
+            "0",
+            "42",
+            "20"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "256",
+            "6",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.10.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.10/attn/GroupQueryAttention/output_0.out2_10"
+      ],
+      "const_args": [
+        "model.layers.10.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.10.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.10.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.10.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.10/attn/o_proj/MatMulNBits/output_0.out5_4_31"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_20",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.10/input_layernorm/output_3.out4_19",
+        "/model/layers.10/attn/o_proj/MatMulNBits/output_0.out5_4_31"
+      ],
+      "const_args": [
+        "model.layers.10.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.10/post_attention_layernorm/output_3.out4_20",
+        "/model/layers.10/post_attention_layernorm/output_0.out4_20"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_10",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.10/post_attention_layernorm/output_0.out4_20"
+      ],
+      "const_args": [
+        "model.layers.10.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.10.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.10.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.10.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.10.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.10.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.10.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.10.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.10/mlp/Mul/output_0.out3_10"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "4096",
+            "14336"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.10.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.10/mlp/Mul/output_0.out3_10"
+      ],
+      "const_args": [
+        "model.layers.10.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.10.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.10.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.10.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.10/mlp/down_proj/MatMulNBits/output_0.out5_4_32"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "14336"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_21",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.10/post_attention_layernorm/output_3.out4_20",
+        "/model/layers.10/mlp/down_proj/MatMulNBits/output_0.out5_4_32"
+      ],
+      "const_args": [
+        "model.layers.11.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.11/input_layernorm/output_3.out4_21",
+        "/model/layers.11/input_layernorm/output_0.out4_21"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_11",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.11/input_layernorm/output_0.out4_21"
+      ],
+      "const_args": [
+        "model.layers.11.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.11.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.11.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.11.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.11/attn/qk_proj/MatMulNBits/output_0.out5_4_33"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "5120"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.11.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.11/input_layernorm/output_0.out4_21"
+      ],
+      "const_args": [
+        "model.layers.11.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.11.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.11.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.11.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.11.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "1024"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "47",
+            "23"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.11/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.11/attn/qk_proj/MatMulNBits/output_0.out5_4_33",
+        "past_key_values.11.key",
+        "past_key_values.11.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.11/attn/GroupQueryAttention/output_0.out2_11",
+        "present.11.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "8"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.0883883461356163"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "8",
+            "32",
+            "1",
+            "4096",
+            "128"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "44",
+            "22",
+            "2",
+            "0",
+            "45",
+            "23",
+            "6",
+            "0",
+            "46",
+            "22"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "256",
+            "6",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.11.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.11/attn/GroupQueryAttention/output_0.out2_11"
+      ],
+      "const_args": [
+        "model.layers.11.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.11.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.11.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.11.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.11/attn/o_proj/MatMulNBits/output_0.out5_4_34"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_22",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.11/input_layernorm/output_3.out4_21",
+        "/model/layers.11/attn/o_proj/MatMulNBits/output_0.out5_4_34"
+      ],
+      "const_args": [
+        "model.layers.11.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.11/post_attention_layernorm/output_3.out4_22",
+        "/model/layers.11/post_attention_layernorm/output_0.out4_22"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_11",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.11/post_attention_layernorm/output_0.out4_22"
+      ],
+      "const_args": [
+        "model.layers.11.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.11.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.11.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.11.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.11.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.11.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.11.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.11.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.11/mlp/Mul/output_0.out3_11"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "4096",
+            "14336"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.11.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.11/mlp/Mul/output_0.out3_11"
+      ],
+      "const_args": [
+        "model.layers.11.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.11.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.11.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.11.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.11/mlp/down_proj/MatMulNBits/output_0.out5_4_35"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "14336"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_23",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.11/post_attention_layernorm/output_3.out4_22",
+        "/model/layers.11/mlp/down_proj/MatMulNBits/output_0.out5_4_35"
+      ],
+      "const_args": [
+        "model.layers.12.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.12/input_layernorm/output_3.out4_23",
+        "/model/layers.12/input_layernorm/output_0.out4_23"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_12",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.12/input_layernorm/output_0.out4_23"
+      ],
+      "const_args": [
+        "model.layers.12.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.12.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.12.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.12.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.12/attn/qk_proj/MatMulNBits/output_0.out5_4_36"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "5120"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.12.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.12/input_layernorm/output_0.out4_23"
+      ],
+      "const_args": [
+        "model.layers.12.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.12.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.12.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.12.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.12.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "1024"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "51",
+            "25"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.12/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.12/attn/qk_proj/MatMulNBits/output_0.out5_4_36",
+        "past_key_values.12.key",
+        "past_key_values.12.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.12/attn/GroupQueryAttention/output_0.out2_12",
+        "present.12.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "8"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.0883883461356163"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "8",
+            "32",
+            "1",
+            "4096",
+            "128"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "48",
+            "24",
+            "2",
+            "0",
+            "49",
+            "25",
+            "6",
+            "0",
+            "50",
+            "24"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "256",
+            "6",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.12.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.12/attn/GroupQueryAttention/output_0.out2_12"
+      ],
+      "const_args": [
+        "model.layers.12.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.12.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.12.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.12.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.12/attn/o_proj/MatMulNBits/output_0.out5_4_37"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_24",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.12/input_layernorm/output_3.out4_23",
+        "/model/layers.12/attn/o_proj/MatMulNBits/output_0.out5_4_37"
+      ],
+      "const_args": [
+        "model.layers.12.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.12/post_attention_layernorm/output_3.out4_24",
+        "/model/layers.12/post_attention_layernorm/output_0.out4_24"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_12",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.12/post_attention_layernorm/output_0.out4_24"
+      ],
+      "const_args": [
+        "model.layers.12.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.12.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.12.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.12.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.12.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.12.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.12.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.12.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.12/mlp/Mul/output_0.out3_12"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "4096",
+            "14336"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.12.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.12/mlp/Mul/output_0.out3_12"
+      ],
+      "const_args": [
+        "model.layers.12.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.12.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.12.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.12.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.12/mlp/down_proj/MatMulNBits/output_0.out5_4_38"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "14336"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_25",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.12/post_attention_layernorm/output_3.out4_24",
+        "/model/layers.12/mlp/down_proj/MatMulNBits/output_0.out5_4_38"
+      ],
+      "const_args": [
+        "model.layers.13.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.13/input_layernorm/output_3.out4_25",
+        "/model/layers.13/input_layernorm/output_0.out4_25"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_13",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.13/input_layernorm/output_0.out4_25"
+      ],
+      "const_args": [
+        "model.layers.13.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.13.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.13.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.13.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.13/attn/qk_proj/MatMulNBits/output_0.out5_4_39"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "5120"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.13.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.13/input_layernorm/output_0.out4_25"
+      ],
+      "const_args": [
+        "model.layers.13.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.13.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.13.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.13.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.13.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "1024"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "55",
+            "27"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.13/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.13/attn/qk_proj/MatMulNBits/output_0.out5_4_39",
+        "past_key_values.13.key",
+        "past_key_values.13.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.13/attn/GroupQueryAttention/output_0.out2_13",
+        "present.13.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "8"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.0883883461356163"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "8",
+            "32",
+            "1",
+            "4096",
+            "128"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "52",
+            "26",
+            "2",
+            "0",
+            "53",
+            "27",
+            "6",
+            "0",
+            "54",
+            "26"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "256",
+            "6",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.13.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.13/attn/GroupQueryAttention/output_0.out2_13"
+      ],
+      "const_args": [
+        "model.layers.13.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.13.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.13.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.13.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.13/attn/o_proj/MatMulNBits/output_0.out5_4_40"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_26",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.13/input_layernorm/output_3.out4_25",
+        "/model/layers.13/attn/o_proj/MatMulNBits/output_0.out5_4_40"
+      ],
+      "const_args": [
+        "model.layers.13.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.13/post_attention_layernorm/output_3.out4_26",
+        "/model/layers.13/post_attention_layernorm/output_0.out4_26"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_13",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.13/post_attention_layernorm/output_0.out4_26"
+      ],
+      "const_args": [
+        "model.layers.13.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.13.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.13.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.13.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.13.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.13.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.13.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.13.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.13/mlp/Mul/output_0.out3_13"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "4096",
+            "14336"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.13.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.13/mlp/Mul/output_0.out3_13"
+      ],
+      "const_args": [
+        "model.layers.13.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.13.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.13.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.13.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.13/mlp/down_proj/MatMulNBits/output_0.out5_4_41"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "14336"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_27",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.13/post_attention_layernorm/output_3.out4_26",
+        "/model/layers.13/mlp/down_proj/MatMulNBits/output_0.out5_4_41"
+      ],
+      "const_args": [
+        "model.layers.14.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.14/input_layernorm/output_3.out4_27",
+        "/model/layers.14/input_layernorm/output_0.out4_27"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_14",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.14/input_layernorm/output_0.out4_27"
+      ],
+      "const_args": [
+        "model.layers.14.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.14.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.14.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.14.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.14/attn/qk_proj/MatMulNBits/output_0.out5_4_42"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "5120"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.14.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.14/input_layernorm/output_0.out4_27"
+      ],
+      "const_args": [
+        "model.layers.14.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.14.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.14.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.14.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.14.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "1024"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "59",
+            "29"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.14/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.14/attn/qk_proj/MatMulNBits/output_0.out5_4_42",
+        "past_key_values.14.key",
+        "past_key_values.14.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.14/attn/GroupQueryAttention/output_0.out2_14",
+        "present.14.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "8"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.0883883461356163"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "8",
+            "32",
+            "1",
+            "4096",
+            "128"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "56",
+            "28",
+            "2",
+            "0",
+            "57",
+            "29",
+            "6",
+            "0",
+            "58",
+            "28"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "256",
+            "6",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.14.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.14/attn/GroupQueryAttention/output_0.out2_14"
+      ],
+      "const_args": [
+        "model.layers.14.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.14.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.14.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.14.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.14/attn/o_proj/MatMulNBits/output_0.out5_4_43"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_28",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.14/input_layernorm/output_3.out4_27",
+        "/model/layers.14/attn/o_proj/MatMulNBits/output_0.out5_4_43"
+      ],
+      "const_args": [
+        "model.layers.14.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.14/post_attention_layernorm/output_3.out4_28",
+        "/model/layers.14/post_attention_layernorm/output_0.out4_28"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_14",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.14/post_attention_layernorm/output_0.out4_28"
+      ],
+      "const_args": [
+        "model.layers.14.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.14.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.14.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.14.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.14.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.14.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.14.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.14.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.14/mlp/Mul/output_0.out3_14"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "4096",
+            "14336"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.14.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.14/mlp/Mul/output_0.out3_14"
+      ],
+      "const_args": [
+        "model.layers.14.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.14.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.14.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.14.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.14/mlp/down_proj/MatMulNBits/output_0.out5_4_44"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "14336"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_29",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.14/post_attention_layernorm/output_3.out4_28",
+        "/model/layers.14/mlp/down_proj/MatMulNBits/output_0.out5_4_44"
+      ],
+      "const_args": [
+        "model.layers.15.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.15/input_layernorm/output_3.out4_29",
+        "/model/layers.15/input_layernorm/output_0.out4_29"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_15",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.15/input_layernorm/output_0.out4_29"
+      ],
+      "const_args": [
+        "model.layers.15.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.15.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.15.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.15.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.15/attn/qk_proj/MatMulNBits/output_0.out5_4_45"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "5120"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.15.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.15/input_layernorm/output_0.out4_29"
+      ],
+      "const_args": [
+        "model.layers.15.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.15.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.15.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.15.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.15.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "1024"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "63",
+            "31"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.15/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.15/attn/qk_proj/MatMulNBits/output_0.out5_4_45",
+        "past_key_values.15.key",
+        "past_key_values.15.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.15/attn/GroupQueryAttention/output_0.out2_15",
+        "present.15.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "8"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.0883883461356163"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "8",
+            "32",
+            "1",
+            "4096",
+            "128"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "60",
+            "30",
+            "2",
+            "0",
+            "61",
+            "31",
+            "6",
+            "0",
+            "62",
+            "30"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "256",
+            "6",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.15.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.15/attn/GroupQueryAttention/output_0.out2_15"
+      ],
+      "const_args": [
+        "model.layers.15.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.15.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.15.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.15.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.15/attn/o_proj/MatMulNBits/output_0.out5_4_46"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_30",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.15/input_layernorm/output_3.out4_29",
+        "/model/layers.15/attn/o_proj/MatMulNBits/output_0.out5_4_46"
+      ],
+      "const_args": [
+        "model.layers.15.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.15/post_attention_layernorm/output_3.out4_30",
+        "/model/layers.15/post_attention_layernorm/output_0.out4_30"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_15",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.15/post_attention_layernorm/output_0.out4_30"
+      ],
+      "const_args": [
+        "model.layers.15.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.15.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.15.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.15.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.15.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.15.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.15.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.15.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.15/mlp/Mul/output_0.out3_15"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "4096",
+            "14336"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.15.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.15/mlp/Mul/output_0.out3_15"
+      ],
+      "const_args": [
+        "model.layers.15.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.15.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.15.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.15.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.15/mlp/down_proj/MatMulNBits/output_0.out5_4_47"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "14336"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_31",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.15/post_attention_layernorm/output_3.out4_30",
+        "/model/layers.15/mlp/down_proj/MatMulNBits/output_0.out5_4_47"
+      ],
+      "const_args": [
+        "model.layers.16.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.16/input_layernorm/output_3.out4_31",
+        "/model/layers.16/input_layernorm/output_0.out4_31"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_16",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.16/input_layernorm/output_0.out4_31"
+      ],
+      "const_args": [
+        "model.layers.16.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.16.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.16.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.16.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.16/attn/qk_proj/MatMulNBits/output_0.out5_4_48"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "5120"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.16.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.16/input_layernorm/output_0.out4_31"
+      ],
+      "const_args": [
+        "model.layers.16.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.16.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.16.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.16.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.16.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "1024"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "67",
+            "33"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.16/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.16/attn/qk_proj/MatMulNBits/output_0.out5_4_48",
+        "past_key_values.16.key",
+        "past_key_values.16.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.16/attn/GroupQueryAttention/output_0.out2_16",
+        "present.16.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "8"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.0883883461356163"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "8",
+            "32",
+            "1",
+            "4096",
+            "128"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "64",
+            "32",
+            "2",
+            "0",
+            "65",
+            "33",
+            "6",
+            "0",
+            "66",
+            "32"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "256",
+            "6",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.16.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.16/attn/GroupQueryAttention/output_0.out2_16"
+      ],
+      "const_args": [
+        "model.layers.16.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.16.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.16.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.16.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.16/attn/o_proj/MatMulNBits/output_0.out5_4_49"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_32",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.16/input_layernorm/output_3.out4_31",
+        "/model/layers.16/attn/o_proj/MatMulNBits/output_0.out5_4_49"
+      ],
+      "const_args": [
+        "model.layers.16.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.16/post_attention_layernorm/output_3.out4_32",
+        "/model/layers.16/post_attention_layernorm/output_0.out4_32"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_16",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.16/post_attention_layernorm/output_0.out4_32"
+      ],
+      "const_args": [
+        "model.layers.16.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.16.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.16.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.16.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.16.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.16.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.16.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.16.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.16/mlp/Mul/output_0.out3_16"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "4096",
+            "14336"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.16.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.16/mlp/Mul/output_0.out3_16"
+      ],
+      "const_args": [
+        "model.layers.16.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.16.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.16.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.16.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.16/mlp/down_proj/MatMulNBits/output_0.out5_4_50"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "14336"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_33",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.16/post_attention_layernorm/output_3.out4_32",
+        "/model/layers.16/mlp/down_proj/MatMulNBits/output_0.out5_4_50"
+      ],
+      "const_args": [
+        "model.layers.17.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.17/input_layernorm/output_3.out4_33",
+        "/model/layers.17/input_layernorm/output_0.out4_33"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_17",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.17/input_layernorm/output_0.out4_33"
+      ],
+      "const_args": [
+        "model.layers.17.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.17.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.17.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.17.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.17/attn/qk_proj/MatMulNBits/output_0.out5_4_51"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "5120"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.17.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.17/input_layernorm/output_0.out4_33"
+      ],
+      "const_args": [
+        "model.layers.17.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.17.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.17.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.17.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.17.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "1024"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "71",
+            "35"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.17/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.17/attn/qk_proj/MatMulNBits/output_0.out5_4_51",
+        "past_key_values.17.key",
+        "past_key_values.17.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.17/attn/GroupQueryAttention/output_0.out2_17",
+        "present.17.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "8"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.0883883461356163"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "8",
+            "32",
+            "1",
+            "4096",
+            "128"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "68",
+            "34",
+            "2",
+            "0",
+            "69",
+            "35",
+            "6",
+            "0",
+            "70",
+            "34"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "256",
+            "6",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.17.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.17/attn/GroupQueryAttention/output_0.out2_17"
+      ],
+      "const_args": [
+        "model.layers.17.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.17.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.17.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.17.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.17/attn/o_proj/MatMulNBits/output_0.out5_4_52"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_34",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.17/input_layernorm/output_3.out4_33",
+        "/model/layers.17/attn/o_proj/MatMulNBits/output_0.out5_4_52"
+      ],
+      "const_args": [
+        "model.layers.17.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.17/post_attention_layernorm/output_3.out4_34",
+        "/model/layers.17/post_attention_layernorm/output_0.out4_34"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_17",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.17/post_attention_layernorm/output_0.out4_34"
+      ],
+      "const_args": [
+        "model.layers.17.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.17.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.17.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.17.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.17.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.17.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.17.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.17.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.17/mlp/Mul/output_0.out3_17"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "4096",
+            "14336"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.17.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.17/mlp/Mul/output_0.out3_17"
+      ],
+      "const_args": [
+        "model.layers.17.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.17.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.17.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.17.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.17/mlp/down_proj/MatMulNBits/output_0.out5_4_53"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "14336"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_35",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.17/post_attention_layernorm/output_3.out4_34",
+        "/model/layers.17/mlp/down_proj/MatMulNBits/output_0.out5_4_53"
+      ],
+      "const_args": [
+        "model.layers.18.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.18/input_layernorm/output_3.out4_35",
+        "/model/layers.18/input_layernorm/output_0.out4_35"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_18",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.18/input_layernorm/output_0.out4_35"
+      ],
+      "const_args": [
+        "model.layers.18.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.18.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.18.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.18.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.18/attn/qk_proj/MatMulNBits/output_0.out5_4_54"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "5120"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.18.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.18/input_layernorm/output_0.out4_35"
+      ],
+      "const_args": [
+        "model.layers.18.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.18.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.18.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.18.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.18.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "1024"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "75",
+            "37"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.18/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.18/attn/qk_proj/MatMulNBits/output_0.out5_4_54",
+        "past_key_values.18.key",
+        "past_key_values.18.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.18/attn/GroupQueryAttention/output_0.out2_18",
+        "present.18.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "8"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.0883883461356163"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "8",
+            "32",
+            "1",
+            "4096",
+            "128"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "72",
+            "36",
+            "2",
+            "0",
+            "73",
+            "37",
+            "6",
+            "0",
+            "74",
+            "36"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "256",
+            "6",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.18.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.18/attn/GroupQueryAttention/output_0.out2_18"
+      ],
+      "const_args": [
+        "model.layers.18.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.18.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.18.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.18.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.18/attn/o_proj/MatMulNBits/output_0.out5_4_55"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_36",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.18/input_layernorm/output_3.out4_35",
+        "/model/layers.18/attn/o_proj/MatMulNBits/output_0.out5_4_55"
+      ],
+      "const_args": [
+        "model.layers.18.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.18/post_attention_layernorm/output_3.out4_36",
+        "/model/layers.18/post_attention_layernorm/output_0.out4_36"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_18",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.18/post_attention_layernorm/output_0.out4_36"
+      ],
+      "const_args": [
+        "model.layers.18.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.18.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.18.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.18.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.18.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.18.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.18.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.18.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.18/mlp/Mul/output_0.out3_18"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "4096",
+            "14336"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.18.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.18/mlp/Mul/output_0.out3_18"
+      ],
+      "const_args": [
+        "model.layers.18.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.18.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.18.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.18.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.18/mlp/down_proj/MatMulNBits/output_0.out5_4_56"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "14336"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_37",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.18/post_attention_layernorm/output_3.out4_36",
+        "/model/layers.18/mlp/down_proj/MatMulNBits/output_0.out5_4_56"
+      ],
+      "const_args": [
+        "model.layers.19.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.19/input_layernorm/output_3.out4_37",
+        "/model/layers.19/input_layernorm/output_0.out4_37"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_19",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.19/input_layernorm/output_0.out4_37"
+      ],
+      "const_args": [
+        "model.layers.19.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.19.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.19.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.19.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.19/attn/qk_proj/MatMulNBits/output_0.out5_4_57"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "5120"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.19.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.19/input_layernorm/output_0.out4_37"
+      ],
+      "const_args": [
+        "model.layers.19.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.19.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.19.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.19.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.19.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "1024"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "79",
+            "39"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.19/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.19/attn/qk_proj/MatMulNBits/output_0.out5_4_57",
+        "past_key_values.19.key",
+        "past_key_values.19.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.19/attn/GroupQueryAttention/output_0.out2_19",
+        "present.19.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "8"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.0883883461356163"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "8",
+            "32",
+            "1",
+            "4096",
+            "128"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "76",
+            "38",
+            "2",
+            "0",
+            "77",
+            "39",
+            "6",
+            "0",
+            "78",
+            "38"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "256",
+            "6",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.19.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.19/attn/GroupQueryAttention/output_0.out2_19"
+      ],
+      "const_args": [
+        "model.layers.19.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.19.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.19.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.19.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.19/attn/o_proj/MatMulNBits/output_0.out5_4_58"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_38",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.19/input_layernorm/output_3.out4_37",
+        "/model/layers.19/attn/o_proj/MatMulNBits/output_0.out5_4_58"
+      ],
+      "const_args": [
+        "model.layers.19.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.19/post_attention_layernorm/output_3.out4_38",
+        "/model/layers.19/post_attention_layernorm/output_0.out4_38"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_19",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.19/post_attention_layernorm/output_0.out4_38"
+      ],
+      "const_args": [
+        "model.layers.19.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.19.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.19.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.19.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.19.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.19.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.19.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.19.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.19/mlp/Mul/output_0.out3_19"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "4096",
+            "14336"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.19.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.19/mlp/Mul/output_0.out3_19"
+      ],
+      "const_args": [
+        "model.layers.19.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.19.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.19.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.19.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.19/mlp/down_proj/MatMulNBits/output_0.out5_4_59"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "14336"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_39",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.19/post_attention_layernorm/output_3.out4_38",
+        "/model/layers.19/mlp/down_proj/MatMulNBits/output_0.out5_4_59"
+      ],
+      "const_args": [
+        "model.layers.20.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.20/input_layernorm/output_3.out4_39",
+        "/model/layers.20/input_layernorm/output_0.out4_39"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_20",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.20/input_layernorm/output_0.out4_39"
+      ],
+      "const_args": [
+        "model.layers.20.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.20.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.20.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.20.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.20/attn/qk_proj/MatMulNBits/output_0.out5_4_60"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "5120"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.20.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.20/input_layernorm/output_0.out4_39"
+      ],
+      "const_args": [
+        "model.layers.20.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.20.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.20.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.20.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.20.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "1024"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "83",
+            "41"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.20/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.20/attn/qk_proj/MatMulNBits/output_0.out5_4_60",
+        "past_key_values.20.key",
+        "past_key_values.20.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.20/attn/GroupQueryAttention/output_0.out2_20",
+        "present.20.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "8"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.0883883461356163"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "8",
+            "32",
+            "1",
+            "4096",
+            "128"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "80",
+            "40",
+            "2",
+            "0",
+            "81",
+            "41",
+            "6",
+            "0",
+            "82",
+            "40"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "256",
+            "6",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.20.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.20/attn/GroupQueryAttention/output_0.out2_20"
+      ],
+      "const_args": [
+        "model.layers.20.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.20.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.20.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.20.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.20/attn/o_proj/MatMulNBits/output_0.out5_4_61"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_40",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.20/input_layernorm/output_3.out4_39",
+        "/model/layers.20/attn/o_proj/MatMulNBits/output_0.out5_4_61"
+      ],
+      "const_args": [
+        "model.layers.20.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.20/post_attention_layernorm/output_3.out4_40",
+        "/model/layers.20/post_attention_layernorm/output_0.out4_40"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_20",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.20/post_attention_layernorm/output_0.out4_40"
+      ],
+      "const_args": [
+        "model.layers.20.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.20.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.20.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.20.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.20.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.20.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.20.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.20.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.20/mlp/Mul/output_0.out3_20"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "4096",
+            "14336"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.20.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.20/mlp/Mul/output_0.out3_20"
+      ],
+      "const_args": [
+        "model.layers.20.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.20.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.20.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.20.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.20/mlp/down_proj/MatMulNBits/output_0.out5_4_62"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "14336"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_41",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.20/post_attention_layernorm/output_3.out4_40",
+        "/model/layers.20/mlp/down_proj/MatMulNBits/output_0.out5_4_62"
+      ],
+      "const_args": [
+        "model.layers.21.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.21/input_layernorm/output_3.out4_41",
+        "/model/layers.21/input_layernorm/output_0.out4_41"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_21",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.21/input_layernorm/output_0.out4_41"
+      ],
+      "const_args": [
+        "model.layers.21.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.21.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.21.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.21.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.21/attn/qk_proj/MatMulNBits/output_0.out5_4_63"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "5120"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.21.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.21/input_layernorm/output_0.out4_41"
+      ],
+      "const_args": [
+        "model.layers.21.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.21.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.21.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.21.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.21.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "1024"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "87",
+            "43"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.21/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.21/attn/qk_proj/MatMulNBits/output_0.out5_4_63",
+        "past_key_values.21.key",
+        "past_key_values.21.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.21/attn/GroupQueryAttention/output_0.out2_21",
+        "present.21.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "8"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.0883883461356163"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "8",
+            "32",
+            "1",
+            "4096",
+            "128"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "84",
+            "42",
+            "2",
+            "0",
+            "85",
+            "43",
+            "6",
+            "0",
+            "86",
+            "42"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "256",
+            "6",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.21.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.21/attn/GroupQueryAttention/output_0.out2_21"
+      ],
+      "const_args": [
+        "model.layers.21.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.21.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.21.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.21.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.21/attn/o_proj/MatMulNBits/output_0.out5_4_64"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_42",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.21/input_layernorm/output_3.out4_41",
+        "/model/layers.21/attn/o_proj/MatMulNBits/output_0.out5_4_64"
+      ],
+      "const_args": [
+        "model.layers.21.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.21/post_attention_layernorm/output_3.out4_42",
+        "/model/layers.21/post_attention_layernorm/output_0.out4_42"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_21",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.21/post_attention_layernorm/output_0.out4_42"
+      ],
+      "const_args": [
+        "model.layers.21.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.21.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.21.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.21.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.21.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.21.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.21.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.21.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.21/mlp/Mul/output_0.out3_21"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "4096",
+            "14336"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.21.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.21/mlp/Mul/output_0.out3_21"
+      ],
+      "const_args": [
+        "model.layers.21.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.21.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.21.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.21.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.21/mlp/down_proj/MatMulNBits/output_0.out5_4_65"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "14336"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_43",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.21/post_attention_layernorm/output_3.out4_42",
+        "/model/layers.21/mlp/down_proj/MatMulNBits/output_0.out5_4_65"
+      ],
+      "const_args": [
+        "model.layers.22.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.22/input_layernorm/output_3.out4_43",
+        "/model/layers.22/input_layernorm/output_0.out4_43"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_22",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.22/input_layernorm/output_0.out4_43"
+      ],
+      "const_args": [
+        "model.layers.22.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.22.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.22.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.22.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.22/attn/qk_proj/MatMulNBits/output_0.out5_4_66"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "5120"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.22.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.22/input_layernorm/output_0.out4_43"
+      ],
+      "const_args": [
+        "model.layers.22.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.22.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.22.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.22.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.22.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "1024"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "91",
+            "45"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.22/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.22/attn/qk_proj/MatMulNBits/output_0.out5_4_66",
+        "past_key_values.22.key",
+        "past_key_values.22.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.22/attn/GroupQueryAttention/output_0.out2_22",
+        "present.22.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "8"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.0883883461356163"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "8",
+            "32",
+            "1",
+            "4096",
+            "128"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "88",
+            "44",
+            "2",
+            "0",
+            "89",
+            "45",
+            "6",
+            "0",
+            "90",
+            "44"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "256",
+            "6",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.22.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.22/attn/GroupQueryAttention/output_0.out2_22"
+      ],
+      "const_args": [
+        "model.layers.22.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.22.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.22.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.22.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.22/attn/o_proj/MatMulNBits/output_0.out5_4_67"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_44",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.22/input_layernorm/output_3.out4_43",
+        "/model/layers.22/attn/o_proj/MatMulNBits/output_0.out5_4_67"
+      ],
+      "const_args": [
+        "model.layers.22.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.22/post_attention_layernorm/output_3.out4_44",
+        "/model/layers.22/post_attention_layernorm/output_0.out4_44"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_22",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.22/post_attention_layernorm/output_0.out4_44"
+      ],
+      "const_args": [
+        "model.layers.22.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.22.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.22.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.22.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.22.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.22.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.22.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.22.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.22/mlp/Mul/output_0.out3_22"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "4096",
+            "14336"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.22.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.22/mlp/Mul/output_0.out3_22"
+      ],
+      "const_args": [
+        "model.layers.22.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.22.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.22.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.22.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.22/mlp/down_proj/MatMulNBits/output_0.out5_4_68"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "14336"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_45",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.22/post_attention_layernorm/output_3.out4_44",
+        "/model/layers.22/mlp/down_proj/MatMulNBits/output_0.out5_4_68"
+      ],
+      "const_args": [
+        "model.layers.23.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.23/input_layernorm/output_3.out4_45",
+        "/model/layers.23/input_layernorm/output_0.out4_45"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_23",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.23/input_layernorm/output_0.out4_45"
+      ],
+      "const_args": [
+        "model.layers.23.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.23.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.23.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.23.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.23/attn/qk_proj/MatMulNBits/output_0.out5_4_69"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "5120"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.23.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.23/input_layernorm/output_0.out4_45"
+      ],
+      "const_args": [
+        "model.layers.23.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.23.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.23.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.23.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.23.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "1024"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "95",
+            "47"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.23/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.23/attn/qk_proj/MatMulNBits/output_0.out5_4_69",
+        "past_key_values.23.key",
+        "past_key_values.23.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.23/attn/GroupQueryAttention/output_0.out2_23",
+        "present.23.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "8"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.0883883461356163"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "8",
+            "32",
+            "1",
+            "4096",
+            "128"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "92",
+            "46",
+            "2",
+            "0",
+            "93",
+            "47",
+            "6",
+            "0",
+            "94",
+            "46"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "256",
+            "6",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.23.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.23/attn/GroupQueryAttention/output_0.out2_23"
+      ],
+      "const_args": [
+        "model.layers.23.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.23.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.23.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.23.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.23/attn/o_proj/MatMulNBits/output_0.out5_4_70"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_46",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.23/input_layernorm/output_3.out4_45",
+        "/model/layers.23/attn/o_proj/MatMulNBits/output_0.out5_4_70"
+      ],
+      "const_args": [
+        "model.layers.23.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.23/post_attention_layernorm/output_3.out4_46",
+        "/model/layers.23/post_attention_layernorm/output_0.out4_46"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_23",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.23/post_attention_layernorm/output_0.out4_46"
+      ],
+      "const_args": [
+        "model.layers.23.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.23.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.23.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.23.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.23.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.23.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.23.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.23.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.23/mlp/Mul/output_0.out3_23"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "4096",
+            "14336"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.23.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.23/mlp/Mul/output_0.out3_23"
+      ],
+      "const_args": [
+        "model.layers.23.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.23.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.23.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.23.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.23/mlp/down_proj/MatMulNBits/output_0.out5_4_71"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "14336"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_47",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.23/post_attention_layernorm/output_3.out4_46",
+        "/model/layers.23/mlp/down_proj/MatMulNBits/output_0.out5_4_71"
+      ],
+      "const_args": [
+        "model.layers.24.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.24/input_layernorm/output_3.out4_47",
+        "/model/layers.24/input_layernorm/output_0.out4_47"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_24",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.24/input_layernorm/output_0.out4_47"
+      ],
+      "const_args": [
+        "model.layers.24.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.24.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.24.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.24.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.24/attn/qk_proj/MatMulNBits/output_0.out5_4_72"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "5120"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.24.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.24/input_layernorm/output_0.out4_47"
+      ],
+      "const_args": [
+        "model.layers.24.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.24.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.24.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.24.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.24.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "1024"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "99",
+            "49"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.24/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.24/attn/qk_proj/MatMulNBits/output_0.out5_4_72",
+        "past_key_values.24.key",
+        "past_key_values.24.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.24/attn/GroupQueryAttention/output_0.out2_24",
+        "present.24.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "8"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.0883883461356163"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "8",
+            "32",
+            "1",
+            "4096",
+            "128"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "96",
+            "48",
+            "2",
+            "0",
+            "97",
+            "49",
+            "6",
+            "0",
+            "98",
+            "48"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "256",
+            "6",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.24.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.24/attn/GroupQueryAttention/output_0.out2_24"
+      ],
+      "const_args": [
+        "model.layers.24.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.24.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.24.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.24.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.24/attn/o_proj/MatMulNBits/output_0.out5_4_73"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_48",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.24/input_layernorm/output_3.out4_47",
+        "/model/layers.24/attn/o_proj/MatMulNBits/output_0.out5_4_73"
+      ],
+      "const_args": [
+        "model.layers.24.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.24/post_attention_layernorm/output_3.out4_48",
+        "/model/layers.24/post_attention_layernorm/output_0.out4_48"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_24",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.24/post_attention_layernorm/output_0.out4_48"
+      ],
+      "const_args": [
+        "model.layers.24.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.24.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.24.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.24.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.24.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.24.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.24.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.24.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.24/mlp/Mul/output_0.out3_24"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "4096",
+            "14336"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.24.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.24/mlp/Mul/output_0.out3_24"
+      ],
+      "const_args": [
+        "model.layers.24.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.24.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.24.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.24.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.24/mlp/down_proj/MatMulNBits/output_0.out5_4_74"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "14336"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_49",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.24/post_attention_layernorm/output_3.out4_48",
+        "/model/layers.24/mlp/down_proj/MatMulNBits/output_0.out5_4_74"
+      ],
+      "const_args": [
+        "model.layers.25.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.25/input_layernorm/output_3.out4_49",
+        "/model/layers.25/input_layernorm/output_0.out4_49"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_25",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.25/input_layernorm/output_0.out4_49"
+      ],
+      "const_args": [
+        "model.layers.25.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.25.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.25.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.25.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.25/attn/qk_proj/MatMulNBits/output_0.out5_4_75"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "5120"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.25.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.25/input_layernorm/output_0.out4_49"
+      ],
+      "const_args": [
+        "model.layers.25.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.25.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.25.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.25.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.25.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "1024"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "103",
+            "51"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.25/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.25/attn/qk_proj/MatMulNBits/output_0.out5_4_75",
+        "past_key_values.25.key",
+        "past_key_values.25.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.25/attn/GroupQueryAttention/output_0.out2_25",
+        "present.25.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "8"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.0883883461356163"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "8",
+            "32",
+            "1",
+            "4096",
+            "128"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "100",
+            "50",
+            "2",
+            "0",
+            "101",
+            "51",
+            "6",
+            "0",
+            "102",
+            "50"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "256",
+            "6",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.25.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.25/attn/GroupQueryAttention/output_0.out2_25"
+      ],
+      "const_args": [
+        "model.layers.25.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.25.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.25.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.25.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.25/attn/o_proj/MatMulNBits/output_0.out5_4_76"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_50",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.25/input_layernorm/output_3.out4_49",
+        "/model/layers.25/attn/o_proj/MatMulNBits/output_0.out5_4_76"
+      ],
+      "const_args": [
+        "model.layers.25.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.25/post_attention_layernorm/output_3.out4_50",
+        "/model/layers.25/post_attention_layernorm/output_0.out4_50"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_25",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.25/post_attention_layernorm/output_0.out4_50"
+      ],
+      "const_args": [
+        "model.layers.25.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.25.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.25.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.25.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.25.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.25.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.25.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.25.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.25/mlp/Mul/output_0.out3_25"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "4096",
+            "14336"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.25.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.25/mlp/Mul/output_0.out3_25"
+      ],
+      "const_args": [
+        "model.layers.25.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.25.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.25.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.25.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.25/mlp/down_proj/MatMulNBits/output_0.out5_4_77"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "14336"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_51",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.25/post_attention_layernorm/output_3.out4_50",
+        "/model/layers.25/mlp/down_proj/MatMulNBits/output_0.out5_4_77"
+      ],
+      "const_args": [
+        "model.layers.26.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.26/input_layernorm/output_3.out4_51",
+        "/model/layers.26/input_layernorm/output_0.out4_51"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_26",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.26/input_layernorm/output_0.out4_51"
+      ],
+      "const_args": [
+        "model.layers.26.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.26.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.26.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.26.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.26/attn/qk_proj/MatMulNBits/output_0.out5_4_78"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "5120"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.26.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.26/input_layernorm/output_0.out4_51"
+      ],
+      "const_args": [
+        "model.layers.26.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.26.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.26.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.26.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.26.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "1024"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "107",
+            "53"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.26/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.26/attn/qk_proj/MatMulNBits/output_0.out5_4_78",
+        "past_key_values.26.key",
+        "past_key_values.26.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.26/attn/GroupQueryAttention/output_0.out2_26",
+        "present.26.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "8"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.0883883461356163"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "8",
+            "32",
+            "1",
+            "4096",
+            "128"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "104",
+            "52",
+            "2",
+            "0",
+            "105",
+            "53",
+            "6",
+            "0",
+            "106",
+            "52"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "256",
+            "6",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.26.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.26/attn/GroupQueryAttention/output_0.out2_26"
+      ],
+      "const_args": [
+        "model.layers.26.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.26.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.26.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.26.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.26/attn/o_proj/MatMulNBits/output_0.out5_4_79"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_52",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.26/input_layernorm/output_3.out4_51",
+        "/model/layers.26/attn/o_proj/MatMulNBits/output_0.out5_4_79"
+      ],
+      "const_args": [
+        "model.layers.26.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.26/post_attention_layernorm/output_3.out4_52",
+        "/model/layers.26/post_attention_layernorm/output_0.out4_52"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_26",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.26/post_attention_layernorm/output_0.out4_52"
+      ],
+      "const_args": [
+        "model.layers.26.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.26.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.26.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.26.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.26.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.26.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.26.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.26.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.26/mlp/Mul/output_0.out3_26"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "4096",
+            "14336"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.26.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.26/mlp/Mul/output_0.out3_26"
+      ],
+      "const_args": [
+        "model.layers.26.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.26.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.26.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.26.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.26/mlp/down_proj/MatMulNBits/output_0.out5_4_80"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "14336"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_53",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.26/post_attention_layernorm/output_3.out4_52",
+        "/model/layers.26/mlp/down_proj/MatMulNBits/output_0.out5_4_80"
+      ],
+      "const_args": [
+        "model.layers.27.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.27/input_layernorm/output_3.out4_53",
+        "/model/layers.27/input_layernorm/output_0.out4_53"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_27",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.27/input_layernorm/output_0.out4_53"
+      ],
+      "const_args": [
+        "model.layers.27.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.27.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.27.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.27.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.27/attn/qk_proj/MatMulNBits/output_0.out5_4_81"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "5120"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.27.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.27/input_layernorm/output_0.out4_53"
+      ],
+      "const_args": [
+        "model.layers.27.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.27.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.27.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.27.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.27.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "1024"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "111",
+            "55"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.27/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.27/attn/qk_proj/MatMulNBits/output_0.out5_4_81",
+        "past_key_values.27.key",
+        "past_key_values.27.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.27/attn/GroupQueryAttention/output_0.out2_27",
+        "present.27.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "8"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.0883883461356163"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "8",
+            "32",
+            "1",
+            "4096",
+            "128"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "108",
+            "54",
+            "2",
+            "0",
+            "109",
+            "55",
+            "6",
+            "0",
+            "110",
+            "54"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "256",
+            "6",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.27.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.27/attn/GroupQueryAttention/output_0.out2_27"
+      ],
+      "const_args": [
+        "model.layers.27.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.27.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.27.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.27.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.27/attn/o_proj/MatMulNBits/output_0.out5_4_82"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_54",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.27/input_layernorm/output_3.out4_53",
+        "/model/layers.27/attn/o_proj/MatMulNBits/output_0.out5_4_82"
+      ],
+      "const_args": [
+        "model.layers.27.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.27/post_attention_layernorm/output_3.out4_54",
+        "/model/layers.27/post_attention_layernorm/output_0.out4_54"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_27",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.27/post_attention_layernorm/output_0.out4_54"
+      ],
+      "const_args": [
+        "model.layers.27.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.27.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.27.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.27.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.27.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.27.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.27.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.27.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.27/mlp/Mul/output_0.out3_27"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "4096",
+            "14336"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.27.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.27/mlp/Mul/output_0.out3_27"
+      ],
+      "const_args": [
+        "model.layers.27.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.27.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.27.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.27.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.27/mlp/down_proj/MatMulNBits/output_0.out5_4_83"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "14336"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_55",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.27/post_attention_layernorm/output_3.out4_54",
+        "/model/layers.27/mlp/down_proj/MatMulNBits/output_0.out5_4_83"
+      ],
+      "const_args": [
+        "model.layers.28.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.28/input_layernorm/output_3.out4_55",
+        "/model/layers.28/input_layernorm/output_0.out4_55"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_28",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.28/input_layernorm/output_0.out4_55"
+      ],
+      "const_args": [
+        "model.layers.28.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.28.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.28.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.28.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.28/attn/qk_proj/MatMulNBits/output_0.out5_4_84"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "5120"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.28.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.28/input_layernorm/output_0.out4_55"
+      ],
+      "const_args": [
+        "model.layers.28.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.28.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.28.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.28.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.28.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "1024"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "115",
+            "57"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.28/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.28/attn/qk_proj/MatMulNBits/output_0.out5_4_84",
+        "past_key_values.28.key",
+        "past_key_values.28.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.28/attn/GroupQueryAttention/output_0.out2_28",
+        "present.28.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "8"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.0883883461356163"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "8",
+            "32",
+            "1",
+            "4096",
+            "128"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "112",
+            "56",
+            "2",
+            "0",
+            "113",
+            "57",
+            "6",
+            "0",
+            "114",
+            "56"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "256",
+            "6",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.28.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.28/attn/GroupQueryAttention/output_0.out2_28"
+      ],
+      "const_args": [
+        "model.layers.28.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.28.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.28.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.28.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.28/attn/o_proj/MatMulNBits/output_0.out5_4_85"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_56",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.28/input_layernorm/output_3.out4_55",
+        "/model/layers.28/attn/o_proj/MatMulNBits/output_0.out5_4_85"
+      ],
+      "const_args": [
+        "model.layers.28.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.28/post_attention_layernorm/output_3.out4_56",
+        "/model/layers.28/post_attention_layernorm/output_0.out4_56"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_28",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.28/post_attention_layernorm/output_0.out4_56"
+      ],
+      "const_args": [
+        "model.layers.28.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.28.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.28.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.28.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.28.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.28.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.28.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.28.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.28/mlp/Mul/output_0.out3_28"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "4096",
+            "14336"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.28.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.28/mlp/Mul/output_0.out3_28"
+      ],
+      "const_args": [
+        "model.layers.28.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.28.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.28.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.28.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.28/mlp/down_proj/MatMulNBits/output_0.out5_4_86"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "14336"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_57",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.28/post_attention_layernorm/output_3.out4_56",
+        "/model/layers.28/mlp/down_proj/MatMulNBits/output_0.out5_4_86"
+      ],
+      "const_args": [
+        "model.layers.29.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.29/input_layernorm/output_3.out4_57",
+        "/model/layers.29/input_layernorm/output_0.out4_57"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_29",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.29/input_layernorm/output_0.out4_57"
+      ],
+      "const_args": [
+        "model.layers.29.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.29.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.29.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.29.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.29/attn/qk_proj/MatMulNBits/output_0.out5_4_87"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "5120"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.29.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.29/input_layernorm/output_0.out4_57"
+      ],
+      "const_args": [
+        "model.layers.29.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.29.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.29.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.29.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.29.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "1024"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "119",
+            "59"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.29/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.29/attn/qk_proj/MatMulNBits/output_0.out5_4_87",
+        "past_key_values.29.key",
+        "past_key_values.29.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.29/attn/GroupQueryAttention/output_0.out2_29",
+        "present.29.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "8"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.0883883461356163"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "8",
+            "32",
+            "1",
+            "4096",
+            "128"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "116",
+            "58",
+            "2",
+            "0",
+            "117",
+            "59",
+            "6",
+            "0",
+            "118",
+            "58"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "256",
+            "6",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.29.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.29/attn/GroupQueryAttention/output_0.out2_29"
+      ],
+      "const_args": [
+        "model.layers.29.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.29.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.29.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.29.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.29/attn/o_proj/MatMulNBits/output_0.out5_4_88"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_58",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.29/input_layernorm/output_3.out4_57",
+        "/model/layers.29/attn/o_proj/MatMulNBits/output_0.out5_4_88"
+      ],
+      "const_args": [
+        "model.layers.29.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.29/post_attention_layernorm/output_3.out4_58",
+        "/model/layers.29/post_attention_layernorm/output_0.out4_58"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_29",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.29/post_attention_layernorm/output_0.out4_58"
+      ],
+      "const_args": [
+        "model.layers.29.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.29.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.29.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.29.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.29.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.29.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.29.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.29.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.29/mlp/Mul/output_0.out3_29"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "4096",
+            "14336"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.29.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.29/mlp/Mul/output_0.out3_29"
+      ],
+      "const_args": [
+        "model.layers.29.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.29.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.29.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.29.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.29/mlp/down_proj/MatMulNBits/output_0.out5_4_89"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "14336"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_59",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.29/post_attention_layernorm/output_3.out4_58",
+        "/model/layers.29/mlp/down_proj/MatMulNBits/output_0.out5_4_89"
+      ],
+      "const_args": [
+        "model.layers.30.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.30/input_layernorm/output_3.out4_59",
+        "/model/layers.30/input_layernorm/output_0.out4_59"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_30",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.30/input_layernorm/output_0.out4_59"
+      ],
+      "const_args": [
+        "model.layers.30.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.30.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.30.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.30.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.30/attn/qk_proj/MatMulNBits/output_0.out5_4_90"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "5120"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.30.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.30/input_layernorm/output_0.out4_59"
+      ],
+      "const_args": [
+        "model.layers.30.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.30.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.30.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.30.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.30.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "1024"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "123",
+            "61"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.30/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.30/attn/qk_proj/MatMulNBits/output_0.out5_4_90",
+        "past_key_values.30.key",
+        "past_key_values.30.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.30/attn/GroupQueryAttention/output_0.out2_30",
+        "present.30.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "8"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.0883883461356163"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "8",
+            "32",
+            "1",
+            "4096",
+            "128"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "120",
+            "60",
+            "2",
+            "0",
+            "121",
+            "61",
+            "6",
+            "0",
+            "122",
+            "60"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "256",
+            "6",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.30.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.30/attn/GroupQueryAttention/output_0.out2_30"
+      ],
+      "const_args": [
+        "model.layers.30.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.30.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.30.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.30.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.30/attn/o_proj/MatMulNBits/output_0.out5_4_91"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_60",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.30/input_layernorm/output_3.out4_59",
+        "/model/layers.30/attn/o_proj/MatMulNBits/output_0.out5_4_91"
+      ],
+      "const_args": [
+        "model.layers.30.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.30/post_attention_layernorm/output_3.out4_60",
+        "/model/layers.30/post_attention_layernorm/output_0.out4_60"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_30",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.30/post_attention_layernorm/output_0.out4_60"
+      ],
+      "const_args": [
+        "model.layers.30.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.30.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.30.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.30.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.30.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.30.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.30.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.30.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.30/mlp/Mul/output_0.out3_30"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "4096",
+            "14336"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.30.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.30/mlp/Mul/output_0.out3_30"
+      ],
+      "const_args": [
+        "model.layers.30.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.30.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.30.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.30.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.30/mlp/down_proj/MatMulNBits/output_0.out5_4_92"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "14336"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_61",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.30/post_attention_layernorm/output_3.out4_60",
+        "/model/layers.30/mlp/down_proj/MatMulNBits/output_0.out5_4_92"
+      ],
+      "const_args": [
+        "model.layers.31.input_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.31/input_layernorm/output_3.out4_61",
+        "/model/layers.31/input_layernorm/output_0.out4_61"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "MatMulNBits_2_31",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.31/input_layernorm/output_0.out4_61"
+      ],
+      "const_args": [
+        "model.layers.31.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.31.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.31.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.31.attn.qk_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.31/attn/qk_proj/MatMulNBits/output_0.out5_4_93"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "5120"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.31.attn.v_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.31/input_layernorm/output_0.out4_61"
+      ],
+      "const_args": [
+        "model.layers.31.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.31.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.31.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.31.attn.v_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "present.31.value"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "1024"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "total_seq_len": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "127",
+            "63"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "5",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/model/layers.31/attn/GroupQueryAttention",
+      "type": "FLATMHA",
+      "in_args": [
+        "/model/layers.31/attn/qk_proj/MatMulNBits/output_0.out5_4_93",
+        "past_key_values.31.key",
+        "past_key_values.31.value",
+        "attention_mask_const_uint",
+        "sin_cos_cache_token"
+      ],
+      "const_args": [],
+      "out_args": [
+        "/model/layers.31/attn/GroupQueryAttention/output_0.out2_31",
+        "present.31.key"
+      ],
+      "attrs": {
+        "num_heads": {
+          "type": "int",
+          "value": [
+            "32"
+          ]
+        },
+        "kv_num_heads": {
+          "type": "int",
+          "value": [
+            "8"
+          ]
+        },
+        "scale": {
+          "type": "float",
+          "value": [
+            "0.0883883461356163"
+          ]
+        },
+        "softcap": {
+          "type": "float",
+          "value": [
+            "0.0"
+          ]
+        },
+        "do_rotary": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "rotary_interleaved": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "8",
+            "32",
+            "1",
+            "4096",
+            "128"
+          ]
+        },
+        "external_buffers": {
+          "type": "int",
+          "value": [
+            "4",
+            "1",
+            "0",
+            "0",
+            "1",
+            "0",
+            "124",
+            "62",
+            "2",
+            "0",
+            "125",
+            "63",
+            "6",
+            "0",
+            "126",
+            "62"
+          ]
+        },
+        "update_tensor_offsets": {
+          "type": "int",
+          "value": [
+            "4",
+            "0",
+            "0",
+            "256",
+            "6",
+            "0",
+            "0",
+            "256"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.31.attn.o_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.31/attn/GroupQueryAttention/output_0.out2_31"
+      ],
+      "const_args": [
+        "model.layers.31.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.31.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.31.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.31.attn.o_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.31/attn/o_proj/MatMulNBits/output_0.out5_4_94"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_62",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.31/input_layernorm/output_3.out4_61",
+        "/model/layers.31/attn/o_proj/MatMulNBits/output_0.out5_4_94"
+      ],
+      "const_args": [
+        "model.layers.31.post_attention_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.31/post_attention_layernorm/output_3.out4_62",
+        "/model/layers.31/post_attention_layernorm/output_0.out4_62"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatMLP_3_31",
+      "type": "FlatMLP",
+      "in_args": [
+        "/model/layers.31/post_attention_layernorm/output_0.out4_62"
+      ],
+      "const_args": [
+        "model.layers.31.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.31.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.31.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.31.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.31.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.31.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.31.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.31.mlp.up_proj.MatMulNBits.bias.f"
+      ],
+      "out_args": [
+        "/model/layers.31/mlp/Mul/output_0.out3_31"
+      ],
+      "attrs": {
+        "input_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "4096",
+            "14336"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float",
+            "uint8",
+            "float"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16"
+          ]
+        }
+      }
+    },
+    {
+      "name": "layers.31.mlp.down_proj",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.31/mlp/Mul/output_0.out3_31"
+      ],
+      "const_args": [
+        "model.layers.31.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.31.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.31.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.31.mlp.down_proj.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "/model/layers.31/mlp/down_proj/MatMulNBits/output_0.out5_4_95"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "14336"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    },
+    {
+      "name": "FlatRMSAdd_4_63",
+      "type": "FlatRMSAdd",
+      "in_args": [
+        "/model/layers.31/post_attention_layernorm/output_3.out4_62",
+        "/model/layers.31/mlp/down_proj/MatMulNBits/output_0.out5_4_95"
+      ],
+      "const_args": [
+        "model.layers.32.final_norm_layernorm.weight.bf"
+      ],
+      "out_args": [
+        "/model/layers.32/final_norm_layernorm/output_0.dummy",
+        "/model/layers.32/final_norm_layernorm/output_0.out4_63"
+      ],
+      "attrs": {
+        "a_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "in_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "out_dtypes": {
+          "type": "str",
+          "value": [
+            "bfloat16",
+            "bfloat16"
+          ]
+        },
+        "c_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "b_shape": {
+          "type": "int",
+          "value": [
+            "1",
+            "1",
+            "4096"
+          ]
+        },
+        "is_gamma_ifm": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        }
+      }
+    },
+    {
+      "name": "/lm_head/MatMulNBits",
+      "type": "MladfMatMul",
+      "in_args": [
+        "/model/layers.32/final_norm_layernorm/output_0.out4_63"
+      ],
+      "const_args": [
+        "lm_head.MatMulNBits.qweight.preformat",
+        "lm_head.MatMulNBits.bias.preformat",
+        "lm_head.MatMulNBits.scales.preformat",
+        "lm_head.MatMulNBits.qzeros.preformat"
+      ],
+      "out_args": [
+        "logits.out5_4_96"
+      ],
+      "attrs": {
+        "accuracy_level": {
+          "type": "int",
+          "value": [
+            "0"
+          ]
+        },
+        "bits": {
+          "type": "int",
+          "value": [
+            "4"
+          ]
+        },
+        "block_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        },
+        "K": {
+          "type": "int",
+          "value": [
+            "4096"
+          ]
+        },
+        "N": {
+          "type": "int",
+          "value": [
+            "32768"
+          ]
+        },
+        "default_shape": {
+          "type": "int",
+          "value": [
+            "1"
+          ]
+        },
+        "op_version": {
+          "type": "str",
+          "value": [
+            "flat"
+          ]
+        },
+        "group_size": {
+          "type": "int",
+          "value": [
+            "128"
+          ]
+        }
+      }
+    }
+  ],
+  "fused_tensors": {
+    "in": {
+      "buffer_size": 24704,
+      "xrt_arg_id": 0,
+      "packed_tensors": [
+        "/model/layers.0/input_layernorm/output_0.out5_4_0",
+        "attention_mask_const_uint",
+        "/model/embed_tokens/Gather/output_0.out4_0"
+      ]
+    },
+    "out": {
+      "buffer_size": 73728,
+      "xrt_arg_id": 1,
+      "packed_tensors": [
+        "/model/layers.32/final_norm_layernorm/output_0.dummy",
+        "logits.out5_4_96"
+      ]
+    },
+    "scratch": {
+      "buffer_size": 3072000,
+      "xrt_arg_id": 2,
+      "packed_tensors": [
+        "/model/layers.0/attn/qk_proj/MatMulNBits/output_0.out5_4_0",
+        "/model/layers.0/attn/GroupQueryAttention/output_0.out2_0",
+        "/model/layers.0/attn/o_proj/MatMulNBits/output_0.out5_4_1",
+        "/model/layers.0/post_attention_layernorm/output_3.out4_0",
+        "/model/layers.0/post_attention_layernorm/output_0.out4_0",
+        "/model/layers.0/mlp/Mul/output_0.out3_0",
+        "/model/layers.0/mlp/down_proj/MatMulNBits/output_0.out5_4_2",
+        "/model/layers.1/input_layernorm/output_3.out4_1",
+        "/model/layers.1/input_layernorm/output_0.out4_1",
+        "/model/layers.1/attn/qk_proj/MatMulNBits/output_0.out5_4_3",
+        "/model/layers.1/attn/GroupQueryAttention/output_0.out2_1",
+        "/model/layers.1/attn/o_proj/MatMulNBits/output_0.out5_4_4",
+        "/model/layers.1/post_attention_layernorm/output_3.out4_2",
+        "/model/layers.1/post_attention_layernorm/output_0.out4_2",
+        "/model/layers.1/mlp/Mul/output_0.out3_1",
+        "/model/layers.1/mlp/down_proj/MatMulNBits/output_0.out5_4_5",
+        "/model/layers.2/input_layernorm/output_3.out4_3",
+        "/model/layers.2/input_layernorm/output_0.out4_3",
+        "/model/layers.2/attn/qk_proj/MatMulNBits/output_0.out5_4_6",
+        "/model/layers.2/attn/GroupQueryAttention/output_0.out2_2",
+        "/model/layers.2/attn/o_proj/MatMulNBits/output_0.out5_4_7",
+        "/model/layers.2/post_attention_layernorm/output_3.out4_4",
+        "/model/layers.2/post_attention_layernorm/output_0.out4_4",
+        "/model/layers.2/mlp/Mul/output_0.out3_2",
+        "/model/layers.2/mlp/down_proj/MatMulNBits/output_0.out5_4_8",
+        "/model/layers.3/input_layernorm/output_3.out4_5",
+        "/model/layers.3/input_layernorm/output_0.out4_5",
+        "/model/layers.3/attn/qk_proj/MatMulNBits/output_0.out5_4_9",
+        "/model/layers.3/attn/GroupQueryAttention/output_0.out2_3",
+        "/model/layers.3/attn/o_proj/MatMulNBits/output_0.out5_4_10",
+        "/model/layers.3/post_attention_layernorm/output_3.out4_6",
+        "/model/layers.3/post_attention_layernorm/output_0.out4_6",
+        "/model/layers.3/mlp/Mul/output_0.out3_3",
+        "/model/layers.3/mlp/down_proj/MatMulNBits/output_0.out5_4_11",
+        "/model/layers.4/input_layernorm/output_3.out4_7",
+        "/model/layers.4/input_layernorm/output_0.out4_7",
+        "/model/layers.4/attn/qk_proj/MatMulNBits/output_0.out5_4_12",
+        "/model/layers.4/attn/GroupQueryAttention/output_0.out2_4",
+        "/model/layers.4/attn/o_proj/MatMulNBits/output_0.out5_4_13",
+        "/model/layers.4/post_attention_layernorm/output_3.out4_8",
+        "/model/layers.4/post_attention_layernorm/output_0.out4_8",
+        "/model/layers.4/mlp/Mul/output_0.out3_4",
+        "/model/layers.4/mlp/down_proj/MatMulNBits/output_0.out5_4_14",
+        "/model/layers.5/input_layernorm/output_3.out4_9",
+        "/model/layers.5/input_layernorm/output_0.out4_9",
+        "/model/layers.5/attn/qk_proj/MatMulNBits/output_0.out5_4_15",
+        "/model/layers.5/attn/GroupQueryAttention/output_0.out2_5",
+        "/model/layers.5/attn/o_proj/MatMulNBits/output_0.out5_4_16",
+        "/model/layers.5/post_attention_layernorm/output_3.out4_10",
+        "/model/layers.5/post_attention_layernorm/output_0.out4_10",
+        "/model/layers.5/mlp/Mul/output_0.out3_5",
+        "/model/layers.5/mlp/down_proj/MatMulNBits/output_0.out5_4_17",
+        "/model/layers.6/input_layernorm/output_3.out4_11",
+        "/model/layers.6/input_layernorm/output_0.out4_11",
+        "/model/layers.6/attn/qk_proj/MatMulNBits/output_0.out5_4_18",
+        "/model/layers.6/attn/GroupQueryAttention/output_0.out2_6",
+        "/model/layers.6/attn/o_proj/MatMulNBits/output_0.out5_4_19",
+        "/model/layers.6/post_attention_layernorm/output_3.out4_12",
+        "/model/layers.6/post_attention_layernorm/output_0.out4_12",
+        "/model/layers.6/mlp/Mul/output_0.out3_6",
+        "/model/layers.6/mlp/down_proj/MatMulNBits/output_0.out5_4_20",
+        "/model/layers.7/input_layernorm/output_3.out4_13",
+        "/model/layers.7/input_layernorm/output_0.out4_13",
+        "/model/layers.7/attn/qk_proj/MatMulNBits/output_0.out5_4_21",
+        "/model/layers.7/attn/GroupQueryAttention/output_0.out2_7",
+        "/model/layers.7/attn/o_proj/MatMulNBits/output_0.out5_4_22",
+        "/model/layers.7/post_attention_layernorm/output_3.out4_14",
+        "/model/layers.7/post_attention_layernorm/output_0.out4_14",
+        "/model/layers.7/mlp/Mul/output_0.out3_7",
+        "/model/layers.7/mlp/down_proj/MatMulNBits/output_0.out5_4_23",
+        "/model/layers.8/input_layernorm/output_3.out4_15",
+        "/model/layers.8/input_layernorm/output_0.out4_15",
+        "/model/layers.8/attn/qk_proj/MatMulNBits/output_0.out5_4_24",
+        "/model/layers.8/attn/GroupQueryAttention/output_0.out2_8",
+        "/model/layers.8/attn/o_proj/MatMulNBits/output_0.out5_4_25",
+        "/model/layers.8/post_attention_layernorm/output_3.out4_16",
+        "/model/layers.8/post_attention_layernorm/output_0.out4_16",
+        "/model/layers.8/mlp/Mul/output_0.out3_8",
+        "/model/layers.8/mlp/down_proj/MatMulNBits/output_0.out5_4_26",
+        "/model/layers.9/input_layernorm/output_3.out4_17",
+        "/model/layers.9/input_layernorm/output_0.out4_17",
+        "/model/layers.9/attn/qk_proj/MatMulNBits/output_0.out5_4_27",
+        "/model/layers.9/attn/GroupQueryAttention/output_0.out2_9",
+        "/model/layers.9/attn/o_proj/MatMulNBits/output_0.out5_4_28",
+        "/model/layers.9/post_attention_layernorm/output_3.out4_18",
+        "/model/layers.9/post_attention_layernorm/output_0.out4_18",
+        "/model/layers.9/mlp/Mul/output_0.out3_9",
+        "/model/layers.9/mlp/down_proj/MatMulNBits/output_0.out5_4_29",
+        "/model/layers.10/input_layernorm/output_3.out4_19",
+        "/model/layers.10/input_layernorm/output_0.out4_19",
+        "/model/layers.10/attn/qk_proj/MatMulNBits/output_0.out5_4_30",
+        "/model/layers.10/attn/GroupQueryAttention/output_0.out2_10",
+        "/model/layers.10/attn/o_proj/MatMulNBits/output_0.out5_4_31",
+        "/model/layers.10/post_attention_layernorm/output_3.out4_20",
+        "/model/layers.10/post_attention_layernorm/output_0.out4_20",
+        "/model/layers.10/mlp/Mul/output_0.out3_10",
+        "/model/layers.10/mlp/down_proj/MatMulNBits/output_0.out5_4_32",
+        "/model/layers.11/input_layernorm/output_3.out4_21",
+        "/model/layers.11/input_layernorm/output_0.out4_21",
+        "/model/layers.11/attn/qk_proj/MatMulNBits/output_0.out5_4_33",
+        "/model/layers.11/attn/GroupQueryAttention/output_0.out2_11",
+        "/model/layers.11/attn/o_proj/MatMulNBits/output_0.out5_4_34",
+        "/model/layers.11/post_attention_layernorm/output_3.out4_22",
+        "/model/layers.11/post_attention_layernorm/output_0.out4_22",
+        "/model/layers.11/mlp/Mul/output_0.out3_11",
+        "/model/layers.11/mlp/down_proj/MatMulNBits/output_0.out5_4_35",
+        "/model/layers.12/input_layernorm/output_3.out4_23",
+        "/model/layers.12/input_layernorm/output_0.out4_23",
+        "/model/layers.12/attn/qk_proj/MatMulNBits/output_0.out5_4_36",
+        "/model/layers.12/attn/GroupQueryAttention/output_0.out2_12",
+        "/model/layers.12/attn/o_proj/MatMulNBits/output_0.out5_4_37",
+        "/model/layers.12/post_attention_layernorm/output_3.out4_24",
+        "/model/layers.12/post_attention_layernorm/output_0.out4_24",
+        "/model/layers.12/mlp/Mul/output_0.out3_12",
+        "/model/layers.12/mlp/down_proj/MatMulNBits/output_0.out5_4_38",
+        "/model/layers.13/input_layernorm/output_3.out4_25",
+        "/model/layers.13/input_layernorm/output_0.out4_25",
+        "/model/layers.13/attn/qk_proj/MatMulNBits/output_0.out5_4_39",
+        "/model/layers.13/attn/GroupQueryAttention/output_0.out2_13",
+        "/model/layers.13/attn/o_proj/MatMulNBits/output_0.out5_4_40",
+        "/model/layers.13/post_attention_layernorm/output_3.out4_26",
+        "/model/layers.13/post_attention_layernorm/output_0.out4_26",
+        "/model/layers.13/mlp/Mul/output_0.out3_13",
+        "/model/layers.13/mlp/down_proj/MatMulNBits/output_0.out5_4_41",
+        "/model/layers.14/input_layernorm/output_3.out4_27",
+        "/model/layers.14/input_layernorm/output_0.out4_27",
+        "/model/layers.14/attn/qk_proj/MatMulNBits/output_0.out5_4_42",
+        "/model/layers.14/attn/GroupQueryAttention/output_0.out2_14",
+        "/model/layers.14/attn/o_proj/MatMulNBits/output_0.out5_4_43",
+        "/model/layers.14/post_attention_layernorm/output_3.out4_28",
+        "/model/layers.14/post_attention_layernorm/output_0.out4_28",
+        "/model/layers.14/mlp/Mul/output_0.out3_14",
+        "/model/layers.14/mlp/down_proj/MatMulNBits/output_0.out5_4_44",
+        "/model/layers.15/input_layernorm/output_3.out4_29",
+        "/model/layers.15/input_layernorm/output_0.out4_29",
+        "/model/layers.15/attn/qk_proj/MatMulNBits/output_0.out5_4_45",
+        "/model/layers.15/attn/GroupQueryAttention/output_0.out2_15",
+        "/model/layers.15/attn/o_proj/MatMulNBits/output_0.out5_4_46",
+        "/model/layers.15/post_attention_layernorm/output_3.out4_30",
+        "/model/layers.15/post_attention_layernorm/output_0.out4_30",
+        "/model/layers.15/mlp/Mul/output_0.out3_15",
+        "/model/layers.15/mlp/down_proj/MatMulNBits/output_0.out5_4_47",
+        "/model/layers.16/input_layernorm/output_3.out4_31",
+        "/model/layers.16/input_layernorm/output_0.out4_31",
+        "/model/layers.16/attn/qk_proj/MatMulNBits/output_0.out5_4_48",
+        "/model/layers.16/attn/GroupQueryAttention/output_0.out2_16",
+        "/model/layers.16/attn/o_proj/MatMulNBits/output_0.out5_4_49",
+        "/model/layers.16/post_attention_layernorm/output_3.out4_32",
+        "/model/layers.16/post_attention_layernorm/output_0.out4_32",
+        "/model/layers.16/mlp/Mul/output_0.out3_16",
+        "/model/layers.16/mlp/down_proj/MatMulNBits/output_0.out5_4_50",
+        "/model/layers.17/input_layernorm/output_3.out4_33",
+        "/model/layers.17/input_layernorm/output_0.out4_33",
+        "/model/layers.17/attn/qk_proj/MatMulNBits/output_0.out5_4_51",
+        "/model/layers.17/attn/GroupQueryAttention/output_0.out2_17",
+        "/model/layers.17/attn/o_proj/MatMulNBits/output_0.out5_4_52",
+        "/model/layers.17/post_attention_layernorm/output_3.out4_34",
+        "/model/layers.17/post_attention_layernorm/output_0.out4_34",
+        "/model/layers.17/mlp/Mul/output_0.out3_17",
+        "/model/layers.17/mlp/down_proj/MatMulNBits/output_0.out5_4_53",
+        "/model/layers.18/input_layernorm/output_3.out4_35",
+        "/model/layers.18/input_layernorm/output_0.out4_35",
+        "/model/layers.18/attn/qk_proj/MatMulNBits/output_0.out5_4_54",
+        "/model/layers.18/attn/GroupQueryAttention/output_0.out2_18",
+        "/model/layers.18/attn/o_proj/MatMulNBits/output_0.out5_4_55",
+        "/model/layers.18/post_attention_layernorm/output_3.out4_36",
+        "/model/layers.18/post_attention_layernorm/output_0.out4_36",
+        "/model/layers.18/mlp/Mul/output_0.out3_18",
+        "/model/layers.18/mlp/down_proj/MatMulNBits/output_0.out5_4_56",
+        "/model/layers.19/input_layernorm/output_3.out4_37",
+        "/model/layers.19/input_layernorm/output_0.out4_37",
+        "/model/layers.19/attn/qk_proj/MatMulNBits/output_0.out5_4_57",
+        "/model/layers.19/attn/GroupQueryAttention/output_0.out2_19",
+        "/model/layers.19/attn/o_proj/MatMulNBits/output_0.out5_4_58",
+        "/model/layers.19/post_attention_layernorm/output_3.out4_38",
+        "/model/layers.19/post_attention_layernorm/output_0.out4_38",
+        "/model/layers.19/mlp/Mul/output_0.out3_19",
+        "/model/layers.19/mlp/down_proj/MatMulNBits/output_0.out5_4_59",
+        "/model/layers.20/input_layernorm/output_3.out4_39",
+        "/model/layers.20/input_layernorm/output_0.out4_39",
+        "/model/layers.20/attn/qk_proj/MatMulNBits/output_0.out5_4_60",
+        "/model/layers.20/attn/GroupQueryAttention/output_0.out2_20",
+        "/model/layers.20/attn/o_proj/MatMulNBits/output_0.out5_4_61",
+        "/model/layers.20/post_attention_layernorm/output_3.out4_40",
+        "/model/layers.20/post_attention_layernorm/output_0.out4_40",
+        "/model/layers.20/mlp/Mul/output_0.out3_20",
+        "/model/layers.20/mlp/down_proj/MatMulNBits/output_0.out5_4_62",
+        "/model/layers.21/input_layernorm/output_3.out4_41",
+        "/model/layers.21/input_layernorm/output_0.out4_41",
+        "/model/layers.21/attn/qk_proj/MatMulNBits/output_0.out5_4_63",
+        "/model/layers.21/attn/GroupQueryAttention/output_0.out2_21",
+        "/model/layers.21/attn/o_proj/MatMulNBits/output_0.out5_4_64",
+        "/model/layers.21/post_attention_layernorm/output_3.out4_42",
+        "/model/layers.21/post_attention_layernorm/output_0.out4_42",
+        "/model/layers.21/mlp/Mul/output_0.out3_21",
+        "/model/layers.21/mlp/down_proj/MatMulNBits/output_0.out5_4_65",
+        "/model/layers.22/input_layernorm/output_3.out4_43",
+        "/model/layers.22/input_layernorm/output_0.out4_43",
+        "/model/layers.22/attn/qk_proj/MatMulNBits/output_0.out5_4_66",
+        "/model/layers.22/attn/GroupQueryAttention/output_0.out2_22",
+        "/model/layers.22/attn/o_proj/MatMulNBits/output_0.out5_4_67",
+        "/model/layers.22/post_attention_layernorm/output_3.out4_44",
+        "/model/layers.22/post_attention_layernorm/output_0.out4_44",
+        "/model/layers.22/mlp/Mul/output_0.out3_22",
+        "/model/layers.22/mlp/down_proj/MatMulNBits/output_0.out5_4_68",
+        "/model/layers.23/input_layernorm/output_3.out4_45",
+        "/model/layers.23/input_layernorm/output_0.out4_45",
+        "/model/layers.23/attn/qk_proj/MatMulNBits/output_0.out5_4_69",
+        "/model/layers.23/attn/GroupQueryAttention/output_0.out2_23",
+        "/model/layers.23/attn/o_proj/MatMulNBits/output_0.out5_4_70",
+        "/model/layers.23/post_attention_layernorm/output_3.out4_46",
+        "/model/layers.23/post_attention_layernorm/output_0.out4_46",
+        "/model/layers.23/mlp/Mul/output_0.out3_23",
+        "/model/layers.23/mlp/down_proj/MatMulNBits/output_0.out5_4_71",
+        "/model/layers.24/input_layernorm/output_3.out4_47",
+        "/model/layers.24/input_layernorm/output_0.out4_47",
+        "/model/layers.24/attn/qk_proj/MatMulNBits/output_0.out5_4_72",
+        "/model/layers.24/attn/GroupQueryAttention/output_0.out2_24",
+        "/model/layers.24/attn/o_proj/MatMulNBits/output_0.out5_4_73",
+        "/model/layers.24/post_attention_layernorm/output_3.out4_48",
+        "/model/layers.24/post_attention_layernorm/output_0.out4_48",
+        "/model/layers.24/mlp/Mul/output_0.out3_24",
+        "/model/layers.24/mlp/down_proj/MatMulNBits/output_0.out5_4_74",
+        "/model/layers.25/input_layernorm/output_3.out4_49",
+        "/model/layers.25/input_layernorm/output_0.out4_49",
+        "/model/layers.25/attn/qk_proj/MatMulNBits/output_0.out5_4_75",
+        "/model/layers.25/attn/GroupQueryAttention/output_0.out2_25",
+        "/model/layers.25/attn/o_proj/MatMulNBits/output_0.out5_4_76",
+        "/model/layers.25/post_attention_layernorm/output_3.out4_50",
+        "/model/layers.25/post_attention_layernorm/output_0.out4_50",
+        "/model/layers.25/mlp/Mul/output_0.out3_25",
+        "/model/layers.25/mlp/down_proj/MatMulNBits/output_0.out5_4_77",
+        "/model/layers.26/input_layernorm/output_3.out4_51",
+        "/model/layers.26/input_layernorm/output_0.out4_51",
+        "/model/layers.26/attn/qk_proj/MatMulNBits/output_0.out5_4_78",
+        "/model/layers.26/attn/GroupQueryAttention/output_0.out2_26",
+        "/model/layers.26/attn/o_proj/MatMulNBits/output_0.out5_4_79",
+        "/model/layers.26/post_attention_layernorm/output_3.out4_52",
+        "/model/layers.26/post_attention_layernorm/output_0.out4_52",
+        "/model/layers.26/mlp/Mul/output_0.out3_26",
+        "/model/layers.26/mlp/down_proj/MatMulNBits/output_0.out5_4_80",
+        "/model/layers.27/input_layernorm/output_3.out4_53",
+        "/model/layers.27/input_layernorm/output_0.out4_53",
+        "/model/layers.27/attn/qk_proj/MatMulNBits/output_0.out5_4_81",
+        "/model/layers.27/attn/GroupQueryAttention/output_0.out2_27",
+        "/model/layers.27/attn/o_proj/MatMulNBits/output_0.out5_4_82",
+        "/model/layers.27/post_attention_layernorm/output_3.out4_54",
+        "/model/layers.27/post_attention_layernorm/output_0.out4_54",
+        "/model/layers.27/mlp/Mul/output_0.out3_27",
+        "/model/layers.27/mlp/down_proj/MatMulNBits/output_0.out5_4_83",
+        "/model/layers.28/input_layernorm/output_3.out4_55",
+        "/model/layers.28/input_layernorm/output_0.out4_55",
+        "/model/layers.28/attn/qk_proj/MatMulNBits/output_0.out5_4_84",
+        "/model/layers.28/attn/GroupQueryAttention/output_0.out2_28",
+        "/model/layers.28/attn/o_proj/MatMulNBits/output_0.out5_4_85",
+        "/model/layers.28/post_attention_layernorm/output_3.out4_56",
+        "/model/layers.28/post_attention_layernorm/output_0.out4_56",
+        "/model/layers.28/mlp/Mul/output_0.out3_28",
+        "/model/layers.28/mlp/down_proj/MatMulNBits/output_0.out5_4_86",
+        "/model/layers.29/input_layernorm/output_3.out4_57",
+        "/model/layers.29/input_layernorm/output_0.out4_57",
+        "/model/layers.29/attn/qk_proj/MatMulNBits/output_0.out5_4_87",
+        "/model/layers.29/attn/GroupQueryAttention/output_0.out2_29",
+        "/model/layers.29/attn/o_proj/MatMulNBits/output_0.out5_4_88",
+        "/model/layers.29/post_attention_layernorm/output_3.out4_58",
+        "/model/layers.29/post_attention_layernorm/output_0.out4_58",
+        "/model/layers.29/mlp/Mul/output_0.out3_29",
+        "/model/layers.29/mlp/down_proj/MatMulNBits/output_0.out5_4_89",
+        "/model/layers.30/input_layernorm/output_3.out4_59",
+        "/model/layers.30/input_layernorm/output_0.out4_59",
+        "/model/layers.30/attn/qk_proj/MatMulNBits/output_0.out5_4_90",
+        "/model/layers.30/attn/GroupQueryAttention/output_0.out2_30",
+        "/model/layers.30/attn/o_proj/MatMulNBits/output_0.out5_4_91",
+        "/model/layers.30/post_attention_layernorm/output_3.out4_60",
+        "/model/layers.30/post_attention_layernorm/output_0.out4_60",
+        "/model/layers.30/mlp/Mul/output_0.out3_30",
+        "/model/layers.30/mlp/down_proj/MatMulNBits/output_0.out5_4_92",
+        "/model/layers.31/input_layernorm/output_3.out4_61",
+        "/model/layers.31/input_layernorm/output_0.out4_61",
+        "/model/layers.31/attn/qk_proj/MatMulNBits/output_0.out5_4_93",
+        "/model/layers.31/attn/GroupQueryAttention/output_0.out2_31",
+        "/model/layers.31/attn/o_proj/MatMulNBits/output_0.out5_4_94",
+        "/model/layers.31/post_attention_layernorm/output_3.out4_62",
+        "/model/layers.31/post_attention_layernorm/output_0.out4_62",
+        "/model/layers.31/mlp/Mul/output_0.out3_31",
+        "/model/layers.31/mlp/down_proj/MatMulNBits/output_0.out5_4_95",
+        "/model/layers.32/final_norm_layernorm/output_0.out4_63"
+      ]
+    },
+    "const": {
+      "buffer_size": 5503844352,
+      "xrt_arg_id": 3,
+      "packed_tensors": [
+        "model.layers.0.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.0.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.0.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.0.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.0.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.0.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.0.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.0.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.0.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.0.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.0.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.0.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.0.post_attention_layernorm.weight.bf",
+        "model.layers.0.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.0.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.0.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.0.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.0.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.0.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.0.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.0.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.0.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.0.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.0.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.0.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.1.input_layernorm.weight.bf",
+        "model.layers.1.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.1.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.1.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.1.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.1.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.1.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.1.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.1.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.1.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.1.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.1.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.1.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.1.post_attention_layernorm.weight.bf",
+        "model.layers.1.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.1.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.1.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.1.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.1.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.1.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.1.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.1.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.1.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.1.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.1.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.1.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.2.input_layernorm.weight.bf",
+        "model.layers.2.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.2.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.2.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.2.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.2.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.2.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.2.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.2.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.2.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.2.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.2.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.2.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.2.post_attention_layernorm.weight.bf",
+        "model.layers.2.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.2.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.2.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.2.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.2.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.2.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.2.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.2.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.2.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.2.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.2.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.2.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.3.input_layernorm.weight.bf",
+        "model.layers.3.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.3.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.3.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.3.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.3.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.3.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.3.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.3.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.3.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.3.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.3.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.3.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.3.post_attention_layernorm.weight.bf",
+        "model.layers.3.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.3.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.3.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.3.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.3.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.3.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.3.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.3.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.3.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.3.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.3.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.3.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.4.input_layernorm.weight.bf",
+        "model.layers.4.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.4.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.4.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.4.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.4.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.4.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.4.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.4.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.4.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.4.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.4.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.4.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.4.post_attention_layernorm.weight.bf",
+        "model.layers.4.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.4.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.4.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.4.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.4.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.4.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.4.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.4.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.4.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.4.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.4.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.4.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.5.input_layernorm.weight.bf",
+        "model.layers.5.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.5.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.5.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.5.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.5.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.5.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.5.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.5.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.5.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.5.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.5.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.5.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.5.post_attention_layernorm.weight.bf",
+        "model.layers.5.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.5.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.5.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.5.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.5.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.5.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.5.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.5.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.5.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.5.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.5.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.5.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.6.input_layernorm.weight.bf",
+        "model.layers.6.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.6.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.6.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.6.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.6.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.6.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.6.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.6.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.6.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.6.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.6.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.6.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.6.post_attention_layernorm.weight.bf",
+        "model.layers.6.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.6.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.6.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.6.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.6.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.6.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.6.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.6.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.6.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.6.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.6.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.6.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.7.input_layernorm.weight.bf",
+        "model.layers.7.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.7.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.7.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.7.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.7.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.7.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.7.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.7.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.7.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.7.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.7.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.7.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.7.post_attention_layernorm.weight.bf",
+        "model.layers.7.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.7.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.7.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.7.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.7.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.7.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.7.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.7.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.7.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.7.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.7.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.7.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.8.input_layernorm.weight.bf",
+        "model.layers.8.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.8.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.8.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.8.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.8.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.8.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.8.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.8.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.8.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.8.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.8.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.8.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.8.post_attention_layernorm.weight.bf",
+        "model.layers.8.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.8.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.8.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.8.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.8.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.8.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.8.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.8.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.8.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.8.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.8.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.8.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.9.input_layernorm.weight.bf",
+        "model.layers.9.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.9.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.9.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.9.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.9.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.9.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.9.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.9.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.9.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.9.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.9.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.9.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.9.post_attention_layernorm.weight.bf",
+        "model.layers.9.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.9.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.9.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.9.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.9.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.9.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.9.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.9.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.9.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.9.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.9.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.9.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.10.input_layernorm.weight.bf",
+        "model.layers.10.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.10.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.10.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.10.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.10.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.10.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.10.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.10.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.10.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.10.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.10.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.10.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.10.post_attention_layernorm.weight.bf",
+        "model.layers.10.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.10.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.10.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.10.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.10.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.10.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.10.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.10.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.10.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.10.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.10.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.10.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.11.input_layernorm.weight.bf",
+        "model.layers.11.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.11.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.11.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.11.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.11.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.11.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.11.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.11.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.11.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.11.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.11.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.11.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.11.post_attention_layernorm.weight.bf",
+        "model.layers.11.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.11.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.11.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.11.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.11.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.11.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.11.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.11.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.11.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.11.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.11.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.11.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.12.input_layernorm.weight.bf",
+        "model.layers.12.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.12.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.12.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.12.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.12.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.12.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.12.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.12.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.12.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.12.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.12.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.12.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.12.post_attention_layernorm.weight.bf",
+        "model.layers.12.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.12.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.12.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.12.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.12.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.12.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.12.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.12.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.12.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.12.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.12.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.12.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.13.input_layernorm.weight.bf",
+        "model.layers.13.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.13.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.13.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.13.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.13.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.13.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.13.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.13.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.13.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.13.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.13.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.13.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.13.post_attention_layernorm.weight.bf",
+        "model.layers.13.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.13.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.13.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.13.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.13.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.13.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.13.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.13.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.13.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.13.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.13.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.13.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.14.input_layernorm.weight.bf",
+        "model.layers.14.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.14.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.14.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.14.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.14.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.14.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.14.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.14.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.14.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.14.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.14.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.14.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.14.post_attention_layernorm.weight.bf",
+        "model.layers.14.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.14.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.14.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.14.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.14.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.14.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.14.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.14.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.14.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.14.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.14.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.14.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.15.input_layernorm.weight.bf",
+        "model.layers.15.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.15.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.15.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.15.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.15.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.15.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.15.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.15.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.15.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.15.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.15.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.15.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.15.post_attention_layernorm.weight.bf",
+        "model.layers.15.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.15.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.15.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.15.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.15.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.15.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.15.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.15.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.15.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.15.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.15.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.15.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.16.input_layernorm.weight.bf",
+        "model.layers.16.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.16.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.16.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.16.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.16.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.16.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.16.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.16.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.16.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.16.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.16.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.16.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.16.post_attention_layernorm.weight.bf",
+        "model.layers.16.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.16.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.16.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.16.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.16.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.16.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.16.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.16.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.16.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.16.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.16.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.16.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.17.input_layernorm.weight.bf",
+        "model.layers.17.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.17.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.17.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.17.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.17.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.17.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.17.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.17.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.17.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.17.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.17.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.17.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.17.post_attention_layernorm.weight.bf",
+        "model.layers.17.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.17.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.17.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.17.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.17.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.17.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.17.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.17.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.17.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.17.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.17.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.17.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.18.input_layernorm.weight.bf",
+        "model.layers.18.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.18.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.18.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.18.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.18.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.18.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.18.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.18.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.18.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.18.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.18.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.18.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.18.post_attention_layernorm.weight.bf",
+        "model.layers.18.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.18.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.18.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.18.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.18.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.18.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.18.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.18.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.18.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.18.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.18.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.18.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.19.input_layernorm.weight.bf",
+        "model.layers.19.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.19.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.19.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.19.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.19.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.19.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.19.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.19.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.19.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.19.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.19.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.19.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.19.post_attention_layernorm.weight.bf",
+        "model.layers.19.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.19.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.19.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.19.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.19.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.19.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.19.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.19.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.19.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.19.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.19.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.19.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.20.input_layernorm.weight.bf",
+        "model.layers.20.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.20.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.20.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.20.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.20.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.20.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.20.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.20.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.20.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.20.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.20.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.20.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.20.post_attention_layernorm.weight.bf",
+        "model.layers.20.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.20.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.20.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.20.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.20.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.20.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.20.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.20.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.20.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.20.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.20.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.20.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.21.input_layernorm.weight.bf",
+        "model.layers.21.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.21.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.21.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.21.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.21.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.21.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.21.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.21.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.21.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.21.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.21.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.21.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.21.post_attention_layernorm.weight.bf",
+        "model.layers.21.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.21.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.21.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.21.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.21.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.21.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.21.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.21.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.21.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.21.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.21.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.21.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.22.input_layernorm.weight.bf",
+        "model.layers.22.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.22.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.22.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.22.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.22.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.22.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.22.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.22.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.22.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.22.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.22.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.22.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.22.post_attention_layernorm.weight.bf",
+        "model.layers.22.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.22.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.22.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.22.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.22.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.22.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.22.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.22.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.22.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.22.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.22.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.22.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.23.input_layernorm.weight.bf",
+        "model.layers.23.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.23.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.23.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.23.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.23.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.23.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.23.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.23.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.23.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.23.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.23.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.23.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.23.post_attention_layernorm.weight.bf",
+        "model.layers.23.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.23.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.23.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.23.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.23.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.23.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.23.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.23.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.23.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.23.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.23.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.23.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.24.input_layernorm.weight.bf",
+        "model.layers.24.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.24.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.24.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.24.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.24.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.24.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.24.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.24.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.24.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.24.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.24.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.24.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.24.post_attention_layernorm.weight.bf",
+        "model.layers.24.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.24.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.24.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.24.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.24.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.24.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.24.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.24.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.24.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.24.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.24.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.24.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.25.input_layernorm.weight.bf",
+        "model.layers.25.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.25.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.25.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.25.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.25.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.25.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.25.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.25.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.25.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.25.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.25.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.25.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.25.post_attention_layernorm.weight.bf",
+        "model.layers.25.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.25.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.25.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.25.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.25.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.25.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.25.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.25.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.25.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.25.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.25.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.25.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.26.input_layernorm.weight.bf",
+        "model.layers.26.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.26.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.26.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.26.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.26.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.26.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.26.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.26.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.26.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.26.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.26.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.26.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.26.post_attention_layernorm.weight.bf",
+        "model.layers.26.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.26.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.26.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.26.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.26.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.26.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.26.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.26.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.26.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.26.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.26.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.26.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.27.input_layernorm.weight.bf",
+        "model.layers.27.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.27.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.27.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.27.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.27.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.27.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.27.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.27.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.27.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.27.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.27.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.27.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.27.post_attention_layernorm.weight.bf",
+        "model.layers.27.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.27.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.27.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.27.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.27.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.27.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.27.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.27.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.27.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.27.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.27.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.27.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.28.input_layernorm.weight.bf",
+        "model.layers.28.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.28.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.28.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.28.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.28.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.28.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.28.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.28.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.28.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.28.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.28.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.28.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.28.post_attention_layernorm.weight.bf",
+        "model.layers.28.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.28.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.28.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.28.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.28.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.28.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.28.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.28.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.28.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.28.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.28.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.28.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.29.input_layernorm.weight.bf",
+        "model.layers.29.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.29.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.29.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.29.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.29.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.29.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.29.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.29.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.29.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.29.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.29.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.29.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.29.post_attention_layernorm.weight.bf",
+        "model.layers.29.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.29.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.29.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.29.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.29.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.29.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.29.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.29.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.29.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.29.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.29.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.29.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.30.input_layernorm.weight.bf",
+        "model.layers.30.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.30.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.30.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.30.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.30.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.30.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.30.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.30.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.30.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.30.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.30.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.30.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.30.post_attention_layernorm.weight.bf",
+        "model.layers.30.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.30.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.30.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.30.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.30.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.30.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.30.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.30.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.30.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.30.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.30.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.30.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.31.input_layernorm.weight.bf",
+        "model.layers.31.attn.qk_proj.MatMulNBits.qweight.preformat",
+        "model.layers.31.attn.qk_proj.MatMulNBits.bias.preformat",
+        "model.layers.31.attn.qk_proj.MatMulNBits.scales.preformat",
+        "model.layers.31.attn.qk_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.31.attn.v_proj.MatMulNBits.qweight.preformat",
+        "model.layers.31.attn.v_proj.MatMulNBits.bias.preformat",
+        "model.layers.31.attn.v_proj.MatMulNBits.scales.preformat",
+        "model.layers.31.attn.v_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.31.attn.o_proj.MatMulNBits.qweight.preformat",
+        "model.layers.31.attn.o_proj.MatMulNBits.bias.preformat",
+        "model.layers.31.attn.o_proj.MatMulNBits.scales.preformat",
+        "model.layers.31.attn.o_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.31.post_attention_layernorm.weight.bf",
+        "model.layers.31.mlp.gate_proj.MatMulNBits.qweight",
+        "model.layers.31.mlp.gate_proj.MatMulNBits.scales.f",
+        "model.layers.31.mlp.gate_proj.MatMulNBits.qzeros",
+        "model.layers.31.mlp.gate_proj.MatMulNBits.bias.f",
+        "model.layers.31.mlp.up_proj.MatMulNBits.qweight",
+        "model.layers.31.mlp.up_proj.MatMulNBits.scales.f",
+        "model.layers.31.mlp.up_proj.MatMulNBits.qzeros",
+        "model.layers.31.mlp.up_proj.MatMulNBits.bias.f",
+        "model.layers.31.mlp.down_proj.MatMulNBits.qweight.preformat",
+        "model.layers.31.mlp.down_proj.MatMulNBits.bias.preformat",
+        "model.layers.31.mlp.down_proj.MatMulNBits.scales.preformat",
+        "model.layers.31.mlp.down_proj.MatMulNBits.qzeros.preformat",
+        "model.layers.32.final_norm_layernorm.weight.bf",
+        "lm_head.MatMulNBits.qweight.preformat",
+        "lm_head.MatMulNBits.bias.preformat",
+        "lm_head.MatMulNBits.scales.preformat",
+        "lm_head.MatMulNBits.qzeros.preformat"
+      ]
+    },
+    "super_instr": {
+      "buffer_size": 0,
+      "xrt_arg_id": 4,
+      "packed_tensors": []
+    },
+    "ext_buf_0": {
+      "buffer_size": 536870912,
+      "xrt_arg_id": 5,
+      "packed_tensors": [
+        "past_key_values.0.key",
+        "past_key_values.0.value",
+        "present.0.key",
+        "present.0.value",
+        "past_key_values.1.key",
+        "past_key_values.1.value",
+        "present.1.key",
+        "present.1.value",
+        "past_key_values.2.key",
+        "past_key_values.2.value",
+        "present.2.key",
+        "present.2.value",
+        "past_key_values.3.key",
+        "past_key_values.3.value",
+        "present.3.key",
+        "present.3.value",
+        "past_key_values.4.key",
+        "past_key_values.4.value",
+        "present.4.key",
+        "present.4.value",
+        "past_key_values.5.key",
+        "past_key_values.5.value",
+        "present.5.key",
+        "present.5.value",
+        "past_key_values.6.key",
+        "past_key_values.6.value",
+        "present.6.key",
+        "present.6.value",
+        "past_key_values.7.key",
+        "past_key_values.7.value",
+        "present.7.key",
+        "present.7.value",
+        "past_key_values.8.key",
+        "past_key_values.8.value",
+        "present.8.key",
+        "present.8.value",
+        "past_key_values.9.key",
+        "past_key_values.9.value",
+        "present.9.key",
+        "present.9.value",
+        "past_key_values.10.key",
+        "past_key_values.10.value",
+        "present.10.key",
+        "present.10.value",
+        "past_key_values.11.key",
+        "past_key_values.11.value",
+        "present.11.key",
+        "present.11.value",
+        "past_key_values.12.key",
+        "past_key_values.12.value",
+        "present.12.key",
+        "present.12.value",
+        "past_key_values.13.key",
+        "past_key_values.13.value",
+        "present.13.key",
+        "present.13.value",
+        "past_key_values.14.key",
+        "past_key_values.14.value",
+        "present.14.key",
+        "present.14.value",
+        "past_key_values.15.key",
+        "past_key_values.15.value",
+        "present.15.key",
+        "present.15.value",
+        "past_key_values.16.key",
+        "past_key_values.16.value",
+        "present.16.key",
+        "present.16.value",
+        "past_key_values.17.key",
+        "past_key_values.17.value",
+        "present.17.key",
+        "present.17.value",
+        "past_key_values.18.key",
+        "past_key_values.18.value",
+        "present.18.key",
+        "present.18.value",
+        "past_key_values.19.key",
+        "past_key_values.19.value",
+        "present.19.key",
+        "present.19.value",
+        "past_key_values.20.key",
+        "past_key_values.20.value",
+        "present.20.key",
+        "present.20.value",
+        "past_key_values.21.key",
+        "past_key_values.21.value",
+        "present.21.key",
+        "present.21.value",
+        "past_key_values.22.key",
+        "past_key_values.22.value",
+        "present.22.key",
+        "present.22.value",
+        "past_key_values.23.key",
+        "past_key_values.23.value",
+        "present.23.key",
+        "present.23.value",
+        "past_key_values.24.key",
+        "past_key_values.24.value",
+        "present.24.key",
+        "present.24.value",
+        "past_key_values.25.key",
+        "past_key_values.25.value",
+        "present.25.key",
+        "present.25.value",
+        "past_key_values.26.key",
+        "past_key_values.26.value",
+        "present.26.key",
+        "present.26.value",
+        "past_key_values.27.key",
+        "past_key_values.27.value",
+        "present.27.key",
+        "present.27.value",
+        "past_key_values.28.key",
+        "past_key_values.28.value",
+        "present.28.key",
+        "present.28.value",
+        "past_key_values.29.key",
+        "past_key_values.29.value",
+        "present.29.key",
+        "present.29.value",
+        "past_key_values.30.key",
+        "past_key_values.30.value",
+        "present.30.key",
+        "present.30.value",
+        "past_key_values.31.key",
+        "past_key_values.31.value",
+        "present.31.key",
+        "present.31.value"
+      ]
+    },
+    "ext_buf_1": {
+      "buffer_size": 8388608,
+      "xrt_arg_id": 6,
+      "packed_tensors": [
+        "sin_cos_cache_token"
+      ]
+    }
+  },
+  "tensor_map": {
+    "/model/layers.0/input_layernorm/output_0.out5_4_0": {
+      "packed_buffer_label": "in",
+      "xrt_arg_id": 0,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 8192
+    },
+    "attention_mask_const_uint": {
+      "packed_buffer_label": "in",
+      "xrt_arg_id": 0,
+      "dtype": "uint32",
+      "shape": [
+        1
+      ],
+      "size_in_bytes": 4,
+      "op_tensor_size": 4,
+      "offset": 24700
+    },
+    "/model/embed_tokens/Gather/output_0.out4_0": {
+      "packed_buffer_label": "in",
+      "xrt_arg_id": 0,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 16388
+    },
+    "/model/layers.32/final_norm_layernorm/output_0.dummy": {
+      "packed_buffer_label": "out",
+      "xrt_arg_id": 1,
+      "dtype": "float16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 0
+    },
+    "logits.out5_4_96": {
+      "packed_buffer_label": "out",
+      "xrt_arg_id": 1,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        32768
+      ],
+      "size_in_bytes": 65536,
+      "op_tensor_size": 65536,
+      "offset": 8192
+    },
+    "/model/layers.0/attn/qk_proj/MatMulNBits/output_0.out5_4_0": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        5120
+      ],
+      "size_in_bytes": 10240,
+      "op_tensor_size": 10240,
+      "offset": 0
+    },
+    "/model/layers.0/attn/GroupQueryAttention/output_0.out2_0": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 10240
+    },
+    "/model/layers.0/attn/o_proj/MatMulNBits/output_0.out5_4_1": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 18432
+    },
+    "/model/layers.0/post_attention_layernorm/output_3.out4_0": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 26624
+    },
+    "/model/layers.0/post_attention_layernorm/output_0.out4_0": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 34816
+    },
+    "/model/layers.0/mlp/Mul/output_0.out3_0": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        14336
+      ],
+      "size_in_bytes": 28672,
+      "op_tensor_size": 28672,
+      "offset": 43008
+    },
+    "/model/layers.0/mlp/down_proj/MatMulNBits/output_0.out5_4_2": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 71680
+    },
+    "/model/layers.1/input_layernorm/output_3.out4_1": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 79872
+    },
+    "/model/layers.1/input_layernorm/output_0.out4_1": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 88064
+    },
+    "/model/layers.1/attn/qk_proj/MatMulNBits/output_0.out5_4_3": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        5120
+      ],
+      "size_in_bytes": 10240,
+      "op_tensor_size": 10240,
+      "offset": 96256
+    },
+    "/model/layers.1/attn/GroupQueryAttention/output_0.out2_1": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 106496
+    },
+    "/model/layers.1/attn/o_proj/MatMulNBits/output_0.out5_4_4": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 114688
+    },
+    "/model/layers.1/post_attention_layernorm/output_3.out4_2": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 122880
+    },
+    "/model/layers.1/post_attention_layernorm/output_0.out4_2": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 131072
+    },
+    "/model/layers.1/mlp/Mul/output_0.out3_1": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        14336
+      ],
+      "size_in_bytes": 28672,
+      "op_tensor_size": 28672,
+      "offset": 139264
+    },
+    "/model/layers.1/mlp/down_proj/MatMulNBits/output_0.out5_4_5": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 167936
+    },
+    "/model/layers.2/input_layernorm/output_3.out4_3": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 176128
+    },
+    "/model/layers.2/input_layernorm/output_0.out4_3": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 184320
+    },
+    "/model/layers.2/attn/qk_proj/MatMulNBits/output_0.out5_4_6": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        5120
+      ],
+      "size_in_bytes": 10240,
+      "op_tensor_size": 10240,
+      "offset": 192512
+    },
+    "/model/layers.2/attn/GroupQueryAttention/output_0.out2_2": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 202752
+    },
+    "/model/layers.2/attn/o_proj/MatMulNBits/output_0.out5_4_7": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 210944
+    },
+    "/model/layers.2/post_attention_layernorm/output_3.out4_4": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 219136
+    },
+    "/model/layers.2/post_attention_layernorm/output_0.out4_4": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 227328
+    },
+    "/model/layers.2/mlp/Mul/output_0.out3_2": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        14336
+      ],
+      "size_in_bytes": 28672,
+      "op_tensor_size": 28672,
+      "offset": 235520
+    },
+    "/model/layers.2/mlp/down_proj/MatMulNBits/output_0.out5_4_8": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 264192
+    },
+    "/model/layers.3/input_layernorm/output_3.out4_5": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 272384
+    },
+    "/model/layers.3/input_layernorm/output_0.out4_5": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 280576
+    },
+    "/model/layers.3/attn/qk_proj/MatMulNBits/output_0.out5_4_9": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        5120
+      ],
+      "size_in_bytes": 10240,
+      "op_tensor_size": 10240,
+      "offset": 288768
+    },
+    "/model/layers.3/attn/GroupQueryAttention/output_0.out2_3": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 299008
+    },
+    "/model/layers.3/attn/o_proj/MatMulNBits/output_0.out5_4_10": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 307200
+    },
+    "/model/layers.3/post_attention_layernorm/output_3.out4_6": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 315392
+    },
+    "/model/layers.3/post_attention_layernorm/output_0.out4_6": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 323584
+    },
+    "/model/layers.3/mlp/Mul/output_0.out3_3": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        14336
+      ],
+      "size_in_bytes": 28672,
+      "op_tensor_size": 28672,
+      "offset": 331776
+    },
+    "/model/layers.3/mlp/down_proj/MatMulNBits/output_0.out5_4_11": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 360448
+    },
+    "/model/layers.4/input_layernorm/output_3.out4_7": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 368640
+    },
+    "/model/layers.4/input_layernorm/output_0.out4_7": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 376832
+    },
+    "/model/layers.4/attn/qk_proj/MatMulNBits/output_0.out5_4_12": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        5120
+      ],
+      "size_in_bytes": 10240,
+      "op_tensor_size": 10240,
+      "offset": 385024
+    },
+    "/model/layers.4/attn/GroupQueryAttention/output_0.out2_4": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 395264
+    },
+    "/model/layers.4/attn/o_proj/MatMulNBits/output_0.out5_4_13": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 403456
+    },
+    "/model/layers.4/post_attention_layernorm/output_3.out4_8": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 411648
+    },
+    "/model/layers.4/post_attention_layernorm/output_0.out4_8": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 419840
+    },
+    "/model/layers.4/mlp/Mul/output_0.out3_4": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        14336
+      ],
+      "size_in_bytes": 28672,
+      "op_tensor_size": 28672,
+      "offset": 428032
+    },
+    "/model/layers.4/mlp/down_proj/MatMulNBits/output_0.out5_4_14": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 456704
+    },
+    "/model/layers.5/input_layernorm/output_3.out4_9": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 464896
+    },
+    "/model/layers.5/input_layernorm/output_0.out4_9": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 473088
+    },
+    "/model/layers.5/attn/qk_proj/MatMulNBits/output_0.out5_4_15": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        5120
+      ],
+      "size_in_bytes": 10240,
+      "op_tensor_size": 10240,
+      "offset": 481280
+    },
+    "/model/layers.5/attn/GroupQueryAttention/output_0.out2_5": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 491520
+    },
+    "/model/layers.5/attn/o_proj/MatMulNBits/output_0.out5_4_16": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 499712
+    },
+    "/model/layers.5/post_attention_layernorm/output_3.out4_10": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 507904
+    },
+    "/model/layers.5/post_attention_layernorm/output_0.out4_10": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 516096
+    },
+    "/model/layers.5/mlp/Mul/output_0.out3_5": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        14336
+      ],
+      "size_in_bytes": 28672,
+      "op_tensor_size": 28672,
+      "offset": 524288
+    },
+    "/model/layers.5/mlp/down_proj/MatMulNBits/output_0.out5_4_17": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 552960
+    },
+    "/model/layers.6/input_layernorm/output_3.out4_11": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 561152
+    },
+    "/model/layers.6/input_layernorm/output_0.out4_11": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 569344
+    },
+    "/model/layers.6/attn/qk_proj/MatMulNBits/output_0.out5_4_18": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        5120
+      ],
+      "size_in_bytes": 10240,
+      "op_tensor_size": 10240,
+      "offset": 577536
+    },
+    "/model/layers.6/attn/GroupQueryAttention/output_0.out2_6": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 587776
+    },
+    "/model/layers.6/attn/o_proj/MatMulNBits/output_0.out5_4_19": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 595968
+    },
+    "/model/layers.6/post_attention_layernorm/output_3.out4_12": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 604160
+    },
+    "/model/layers.6/post_attention_layernorm/output_0.out4_12": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 612352
+    },
+    "/model/layers.6/mlp/Mul/output_0.out3_6": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        14336
+      ],
+      "size_in_bytes": 28672,
+      "op_tensor_size": 28672,
+      "offset": 620544
+    },
+    "/model/layers.6/mlp/down_proj/MatMulNBits/output_0.out5_4_20": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 649216
+    },
+    "/model/layers.7/input_layernorm/output_3.out4_13": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 657408
+    },
+    "/model/layers.7/input_layernorm/output_0.out4_13": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 665600
+    },
+    "/model/layers.7/attn/qk_proj/MatMulNBits/output_0.out5_4_21": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        5120
+      ],
+      "size_in_bytes": 10240,
+      "op_tensor_size": 10240,
+      "offset": 673792
+    },
+    "/model/layers.7/attn/GroupQueryAttention/output_0.out2_7": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 684032
+    },
+    "/model/layers.7/attn/o_proj/MatMulNBits/output_0.out5_4_22": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 692224
+    },
+    "/model/layers.7/post_attention_layernorm/output_3.out4_14": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 700416
+    },
+    "/model/layers.7/post_attention_layernorm/output_0.out4_14": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 708608
+    },
+    "/model/layers.7/mlp/Mul/output_0.out3_7": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        14336
+      ],
+      "size_in_bytes": 28672,
+      "op_tensor_size": 28672,
+      "offset": 716800
+    },
+    "/model/layers.7/mlp/down_proj/MatMulNBits/output_0.out5_4_23": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 745472
+    },
+    "/model/layers.8/input_layernorm/output_3.out4_15": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 753664
+    },
+    "/model/layers.8/input_layernorm/output_0.out4_15": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 761856
+    },
+    "/model/layers.8/attn/qk_proj/MatMulNBits/output_0.out5_4_24": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        5120
+      ],
+      "size_in_bytes": 10240,
+      "op_tensor_size": 10240,
+      "offset": 770048
+    },
+    "/model/layers.8/attn/GroupQueryAttention/output_0.out2_8": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 780288
+    },
+    "/model/layers.8/attn/o_proj/MatMulNBits/output_0.out5_4_25": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 788480
+    },
+    "/model/layers.8/post_attention_layernorm/output_3.out4_16": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 796672
+    },
+    "/model/layers.8/post_attention_layernorm/output_0.out4_16": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 804864
+    },
+    "/model/layers.8/mlp/Mul/output_0.out3_8": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        14336
+      ],
+      "size_in_bytes": 28672,
+      "op_tensor_size": 28672,
+      "offset": 813056
+    },
+    "/model/layers.8/mlp/down_proj/MatMulNBits/output_0.out5_4_26": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 841728
+    },
+    "/model/layers.9/input_layernorm/output_3.out4_17": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 849920
+    },
+    "/model/layers.9/input_layernorm/output_0.out4_17": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 858112
+    },
+    "/model/layers.9/attn/qk_proj/MatMulNBits/output_0.out5_4_27": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        5120
+      ],
+      "size_in_bytes": 10240,
+      "op_tensor_size": 10240,
+      "offset": 866304
+    },
+    "/model/layers.9/attn/GroupQueryAttention/output_0.out2_9": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 876544
+    },
+    "/model/layers.9/attn/o_proj/MatMulNBits/output_0.out5_4_28": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 884736
+    },
+    "/model/layers.9/post_attention_layernorm/output_3.out4_18": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 892928
+    },
+    "/model/layers.9/post_attention_layernorm/output_0.out4_18": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 901120
+    },
+    "/model/layers.9/mlp/Mul/output_0.out3_9": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        14336
+      ],
+      "size_in_bytes": 28672,
+      "op_tensor_size": 28672,
+      "offset": 909312
+    },
+    "/model/layers.9/mlp/down_proj/MatMulNBits/output_0.out5_4_29": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 937984
+    },
+    "/model/layers.10/input_layernorm/output_3.out4_19": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 946176
+    },
+    "/model/layers.10/input_layernorm/output_0.out4_19": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 954368
+    },
+    "/model/layers.10/attn/qk_proj/MatMulNBits/output_0.out5_4_30": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        5120
+      ],
+      "size_in_bytes": 10240,
+      "op_tensor_size": 10240,
+      "offset": 962560
+    },
+    "/model/layers.10/attn/GroupQueryAttention/output_0.out2_10": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 972800
+    },
+    "/model/layers.10/attn/o_proj/MatMulNBits/output_0.out5_4_31": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 980992
+    },
+    "/model/layers.10/post_attention_layernorm/output_3.out4_20": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 989184
+    },
+    "/model/layers.10/post_attention_layernorm/output_0.out4_20": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 997376
+    },
+    "/model/layers.10/mlp/Mul/output_0.out3_10": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        14336
+      ],
+      "size_in_bytes": 28672,
+      "op_tensor_size": 28672,
+      "offset": 1005568
+    },
+    "/model/layers.10/mlp/down_proj/MatMulNBits/output_0.out5_4_32": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1034240
+    },
+    "/model/layers.11/input_layernorm/output_3.out4_21": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1042432
+    },
+    "/model/layers.11/input_layernorm/output_0.out4_21": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1050624
+    },
+    "/model/layers.11/attn/qk_proj/MatMulNBits/output_0.out5_4_33": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        5120
+      ],
+      "size_in_bytes": 10240,
+      "op_tensor_size": 10240,
+      "offset": 1058816
+    },
+    "/model/layers.11/attn/GroupQueryAttention/output_0.out2_11": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1069056
+    },
+    "/model/layers.11/attn/o_proj/MatMulNBits/output_0.out5_4_34": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1077248
+    },
+    "/model/layers.11/post_attention_layernorm/output_3.out4_22": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1085440
+    },
+    "/model/layers.11/post_attention_layernorm/output_0.out4_22": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1093632
+    },
+    "/model/layers.11/mlp/Mul/output_0.out3_11": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        14336
+      ],
+      "size_in_bytes": 28672,
+      "op_tensor_size": 28672,
+      "offset": 1101824
+    },
+    "/model/layers.11/mlp/down_proj/MatMulNBits/output_0.out5_4_35": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1130496
+    },
+    "/model/layers.12/input_layernorm/output_3.out4_23": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1138688
+    },
+    "/model/layers.12/input_layernorm/output_0.out4_23": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1146880
+    },
+    "/model/layers.12/attn/qk_proj/MatMulNBits/output_0.out5_4_36": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        5120
+      ],
+      "size_in_bytes": 10240,
+      "op_tensor_size": 10240,
+      "offset": 1155072
+    },
+    "/model/layers.12/attn/GroupQueryAttention/output_0.out2_12": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1165312
+    },
+    "/model/layers.12/attn/o_proj/MatMulNBits/output_0.out5_4_37": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1173504
+    },
+    "/model/layers.12/post_attention_layernorm/output_3.out4_24": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1181696
+    },
+    "/model/layers.12/post_attention_layernorm/output_0.out4_24": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1189888
+    },
+    "/model/layers.12/mlp/Mul/output_0.out3_12": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        14336
+      ],
+      "size_in_bytes": 28672,
+      "op_tensor_size": 28672,
+      "offset": 1198080
+    },
+    "/model/layers.12/mlp/down_proj/MatMulNBits/output_0.out5_4_38": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1226752
+    },
+    "/model/layers.13/input_layernorm/output_3.out4_25": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1234944
+    },
+    "/model/layers.13/input_layernorm/output_0.out4_25": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1243136
+    },
+    "/model/layers.13/attn/qk_proj/MatMulNBits/output_0.out5_4_39": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        5120
+      ],
+      "size_in_bytes": 10240,
+      "op_tensor_size": 10240,
+      "offset": 1251328
+    },
+    "/model/layers.13/attn/GroupQueryAttention/output_0.out2_13": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1261568
+    },
+    "/model/layers.13/attn/o_proj/MatMulNBits/output_0.out5_4_40": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1269760
+    },
+    "/model/layers.13/post_attention_layernorm/output_3.out4_26": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1277952
+    },
+    "/model/layers.13/post_attention_layernorm/output_0.out4_26": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1286144
+    },
+    "/model/layers.13/mlp/Mul/output_0.out3_13": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        14336
+      ],
+      "size_in_bytes": 28672,
+      "op_tensor_size": 28672,
+      "offset": 1294336
+    },
+    "/model/layers.13/mlp/down_proj/MatMulNBits/output_0.out5_4_41": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1323008
+    },
+    "/model/layers.14/input_layernorm/output_3.out4_27": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1331200
+    },
+    "/model/layers.14/input_layernorm/output_0.out4_27": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1339392
+    },
+    "/model/layers.14/attn/qk_proj/MatMulNBits/output_0.out5_4_42": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        5120
+      ],
+      "size_in_bytes": 10240,
+      "op_tensor_size": 10240,
+      "offset": 1347584
+    },
+    "/model/layers.14/attn/GroupQueryAttention/output_0.out2_14": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1357824
+    },
+    "/model/layers.14/attn/o_proj/MatMulNBits/output_0.out5_4_43": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1366016
+    },
+    "/model/layers.14/post_attention_layernorm/output_3.out4_28": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1374208
+    },
+    "/model/layers.14/post_attention_layernorm/output_0.out4_28": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1382400
+    },
+    "/model/layers.14/mlp/Mul/output_0.out3_14": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        14336
+      ],
+      "size_in_bytes": 28672,
+      "op_tensor_size": 28672,
+      "offset": 1390592
+    },
+    "/model/layers.14/mlp/down_proj/MatMulNBits/output_0.out5_4_44": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1419264
+    },
+    "/model/layers.15/input_layernorm/output_3.out4_29": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1427456
+    },
+    "/model/layers.15/input_layernorm/output_0.out4_29": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1435648
+    },
+    "/model/layers.15/attn/qk_proj/MatMulNBits/output_0.out5_4_45": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        5120
+      ],
+      "size_in_bytes": 10240,
+      "op_tensor_size": 10240,
+      "offset": 1443840
+    },
+    "/model/layers.15/attn/GroupQueryAttention/output_0.out2_15": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1454080
+    },
+    "/model/layers.15/attn/o_proj/MatMulNBits/output_0.out5_4_46": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1462272
+    },
+    "/model/layers.15/post_attention_layernorm/output_3.out4_30": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1470464
+    },
+    "/model/layers.15/post_attention_layernorm/output_0.out4_30": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1478656
+    },
+    "/model/layers.15/mlp/Mul/output_0.out3_15": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        14336
+      ],
+      "size_in_bytes": 28672,
+      "op_tensor_size": 28672,
+      "offset": 1486848
+    },
+    "/model/layers.15/mlp/down_proj/MatMulNBits/output_0.out5_4_47": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1515520
+    },
+    "/model/layers.16/input_layernorm/output_3.out4_31": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1523712
+    },
+    "/model/layers.16/input_layernorm/output_0.out4_31": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1531904
+    },
+    "/model/layers.16/attn/qk_proj/MatMulNBits/output_0.out5_4_48": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        5120
+      ],
+      "size_in_bytes": 10240,
+      "op_tensor_size": 10240,
+      "offset": 1540096
+    },
+    "/model/layers.16/attn/GroupQueryAttention/output_0.out2_16": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1550336
+    },
+    "/model/layers.16/attn/o_proj/MatMulNBits/output_0.out5_4_49": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1558528
+    },
+    "/model/layers.16/post_attention_layernorm/output_3.out4_32": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1566720
+    },
+    "/model/layers.16/post_attention_layernorm/output_0.out4_32": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1574912
+    },
+    "/model/layers.16/mlp/Mul/output_0.out3_16": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        14336
+      ],
+      "size_in_bytes": 28672,
+      "op_tensor_size": 28672,
+      "offset": 1583104
+    },
+    "/model/layers.16/mlp/down_proj/MatMulNBits/output_0.out5_4_50": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1611776
+    },
+    "/model/layers.17/input_layernorm/output_3.out4_33": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1619968
+    },
+    "/model/layers.17/input_layernorm/output_0.out4_33": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1628160
+    },
+    "/model/layers.17/attn/qk_proj/MatMulNBits/output_0.out5_4_51": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        5120
+      ],
+      "size_in_bytes": 10240,
+      "op_tensor_size": 10240,
+      "offset": 1636352
+    },
+    "/model/layers.17/attn/GroupQueryAttention/output_0.out2_17": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1646592
+    },
+    "/model/layers.17/attn/o_proj/MatMulNBits/output_0.out5_4_52": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1654784
+    },
+    "/model/layers.17/post_attention_layernorm/output_3.out4_34": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1662976
+    },
+    "/model/layers.17/post_attention_layernorm/output_0.out4_34": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1671168
+    },
+    "/model/layers.17/mlp/Mul/output_0.out3_17": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        14336
+      ],
+      "size_in_bytes": 28672,
+      "op_tensor_size": 28672,
+      "offset": 1679360
+    },
+    "/model/layers.17/mlp/down_proj/MatMulNBits/output_0.out5_4_53": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1708032
+    },
+    "/model/layers.18/input_layernorm/output_3.out4_35": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1716224
+    },
+    "/model/layers.18/input_layernorm/output_0.out4_35": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1724416
+    },
+    "/model/layers.18/attn/qk_proj/MatMulNBits/output_0.out5_4_54": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        5120
+      ],
+      "size_in_bytes": 10240,
+      "op_tensor_size": 10240,
+      "offset": 1732608
+    },
+    "/model/layers.18/attn/GroupQueryAttention/output_0.out2_18": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1742848
+    },
+    "/model/layers.18/attn/o_proj/MatMulNBits/output_0.out5_4_55": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1751040
+    },
+    "/model/layers.18/post_attention_layernorm/output_3.out4_36": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1759232
+    },
+    "/model/layers.18/post_attention_layernorm/output_0.out4_36": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1767424
+    },
+    "/model/layers.18/mlp/Mul/output_0.out3_18": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        14336
+      ],
+      "size_in_bytes": 28672,
+      "op_tensor_size": 28672,
+      "offset": 1775616
+    },
+    "/model/layers.18/mlp/down_proj/MatMulNBits/output_0.out5_4_56": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1804288
+    },
+    "/model/layers.19/input_layernorm/output_3.out4_37": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1812480
+    },
+    "/model/layers.19/input_layernorm/output_0.out4_37": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1820672
+    },
+    "/model/layers.19/attn/qk_proj/MatMulNBits/output_0.out5_4_57": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        5120
+      ],
+      "size_in_bytes": 10240,
+      "op_tensor_size": 10240,
+      "offset": 1828864
+    },
+    "/model/layers.19/attn/GroupQueryAttention/output_0.out2_19": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1839104
+    },
+    "/model/layers.19/attn/o_proj/MatMulNBits/output_0.out5_4_58": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1847296
+    },
+    "/model/layers.19/post_attention_layernorm/output_3.out4_38": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1855488
+    },
+    "/model/layers.19/post_attention_layernorm/output_0.out4_38": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1863680
+    },
+    "/model/layers.19/mlp/Mul/output_0.out3_19": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        14336
+      ],
+      "size_in_bytes": 28672,
+      "op_tensor_size": 28672,
+      "offset": 1871872
+    },
+    "/model/layers.19/mlp/down_proj/MatMulNBits/output_0.out5_4_59": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1900544
+    },
+    "/model/layers.20/input_layernorm/output_3.out4_39": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1908736
+    },
+    "/model/layers.20/input_layernorm/output_0.out4_39": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1916928
+    },
+    "/model/layers.20/attn/qk_proj/MatMulNBits/output_0.out5_4_60": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        5120
+      ],
+      "size_in_bytes": 10240,
+      "op_tensor_size": 10240,
+      "offset": 1925120
+    },
+    "/model/layers.20/attn/GroupQueryAttention/output_0.out2_20": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1935360
+    },
+    "/model/layers.20/attn/o_proj/MatMulNBits/output_0.out5_4_61": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1943552
+    },
+    "/model/layers.20/post_attention_layernorm/output_3.out4_40": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1951744
+    },
+    "/model/layers.20/post_attention_layernorm/output_0.out4_40": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1959936
+    },
+    "/model/layers.20/mlp/Mul/output_0.out3_20": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        14336
+      ],
+      "size_in_bytes": 28672,
+      "op_tensor_size": 28672,
+      "offset": 1968128
+    },
+    "/model/layers.20/mlp/down_proj/MatMulNBits/output_0.out5_4_62": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1996800
+    },
+    "/model/layers.21/input_layernorm/output_3.out4_41": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2004992
+    },
+    "/model/layers.21/input_layernorm/output_0.out4_41": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2013184
+    },
+    "/model/layers.21/attn/qk_proj/MatMulNBits/output_0.out5_4_63": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        5120
+      ],
+      "size_in_bytes": 10240,
+      "op_tensor_size": 10240,
+      "offset": 2021376
+    },
+    "/model/layers.21/attn/GroupQueryAttention/output_0.out2_21": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2031616
+    },
+    "/model/layers.21/attn/o_proj/MatMulNBits/output_0.out5_4_64": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2039808
+    },
+    "/model/layers.21/post_attention_layernorm/output_3.out4_42": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2048000
+    },
+    "/model/layers.21/post_attention_layernorm/output_0.out4_42": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2056192
+    },
+    "/model/layers.21/mlp/Mul/output_0.out3_21": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        14336
+      ],
+      "size_in_bytes": 28672,
+      "op_tensor_size": 28672,
+      "offset": 2064384
+    },
+    "/model/layers.21/mlp/down_proj/MatMulNBits/output_0.out5_4_65": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2093056
+    },
+    "/model/layers.22/input_layernorm/output_3.out4_43": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2101248
+    },
+    "/model/layers.22/input_layernorm/output_0.out4_43": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2109440
+    },
+    "/model/layers.22/attn/qk_proj/MatMulNBits/output_0.out5_4_66": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        5120
+      ],
+      "size_in_bytes": 10240,
+      "op_tensor_size": 10240,
+      "offset": 2117632
+    },
+    "/model/layers.22/attn/GroupQueryAttention/output_0.out2_22": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2127872
+    },
+    "/model/layers.22/attn/o_proj/MatMulNBits/output_0.out5_4_67": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2136064
+    },
+    "/model/layers.22/post_attention_layernorm/output_3.out4_44": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2144256
+    },
+    "/model/layers.22/post_attention_layernorm/output_0.out4_44": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2152448
+    },
+    "/model/layers.22/mlp/Mul/output_0.out3_22": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        14336
+      ],
+      "size_in_bytes": 28672,
+      "op_tensor_size": 28672,
+      "offset": 2160640
+    },
+    "/model/layers.22/mlp/down_proj/MatMulNBits/output_0.out5_4_68": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2189312
+    },
+    "/model/layers.23/input_layernorm/output_3.out4_45": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2197504
+    },
+    "/model/layers.23/input_layernorm/output_0.out4_45": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2205696
+    },
+    "/model/layers.23/attn/qk_proj/MatMulNBits/output_0.out5_4_69": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        5120
+      ],
+      "size_in_bytes": 10240,
+      "op_tensor_size": 10240,
+      "offset": 2213888
+    },
+    "/model/layers.23/attn/GroupQueryAttention/output_0.out2_23": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2224128
+    },
+    "/model/layers.23/attn/o_proj/MatMulNBits/output_0.out5_4_70": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2232320
+    },
+    "/model/layers.23/post_attention_layernorm/output_3.out4_46": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2240512
+    },
+    "/model/layers.23/post_attention_layernorm/output_0.out4_46": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2248704
+    },
+    "/model/layers.23/mlp/Mul/output_0.out3_23": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        14336
+      ],
+      "size_in_bytes": 28672,
+      "op_tensor_size": 28672,
+      "offset": 2256896
+    },
+    "/model/layers.23/mlp/down_proj/MatMulNBits/output_0.out5_4_71": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2285568
+    },
+    "/model/layers.24/input_layernorm/output_3.out4_47": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2293760
+    },
+    "/model/layers.24/input_layernorm/output_0.out4_47": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2301952
+    },
+    "/model/layers.24/attn/qk_proj/MatMulNBits/output_0.out5_4_72": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        5120
+      ],
+      "size_in_bytes": 10240,
+      "op_tensor_size": 10240,
+      "offset": 2310144
+    },
+    "/model/layers.24/attn/GroupQueryAttention/output_0.out2_24": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2320384
+    },
+    "/model/layers.24/attn/o_proj/MatMulNBits/output_0.out5_4_73": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2328576
+    },
+    "/model/layers.24/post_attention_layernorm/output_3.out4_48": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2336768
+    },
+    "/model/layers.24/post_attention_layernorm/output_0.out4_48": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2344960
+    },
+    "/model/layers.24/mlp/Mul/output_0.out3_24": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        14336
+      ],
+      "size_in_bytes": 28672,
+      "op_tensor_size": 28672,
+      "offset": 2353152
+    },
+    "/model/layers.24/mlp/down_proj/MatMulNBits/output_0.out5_4_74": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2381824
+    },
+    "/model/layers.25/input_layernorm/output_3.out4_49": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2390016
+    },
+    "/model/layers.25/input_layernorm/output_0.out4_49": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2398208
+    },
+    "/model/layers.25/attn/qk_proj/MatMulNBits/output_0.out5_4_75": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        5120
+      ],
+      "size_in_bytes": 10240,
+      "op_tensor_size": 10240,
+      "offset": 2406400
+    },
+    "/model/layers.25/attn/GroupQueryAttention/output_0.out2_25": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2416640
+    },
+    "/model/layers.25/attn/o_proj/MatMulNBits/output_0.out5_4_76": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2424832
+    },
+    "/model/layers.25/post_attention_layernorm/output_3.out4_50": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2433024
+    },
+    "/model/layers.25/post_attention_layernorm/output_0.out4_50": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2441216
+    },
+    "/model/layers.25/mlp/Mul/output_0.out3_25": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        14336
+      ],
+      "size_in_bytes": 28672,
+      "op_tensor_size": 28672,
+      "offset": 2449408
+    },
+    "/model/layers.25/mlp/down_proj/MatMulNBits/output_0.out5_4_77": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2478080
+    },
+    "/model/layers.26/input_layernorm/output_3.out4_51": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2486272
+    },
+    "/model/layers.26/input_layernorm/output_0.out4_51": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2494464
+    },
+    "/model/layers.26/attn/qk_proj/MatMulNBits/output_0.out5_4_78": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        5120
+      ],
+      "size_in_bytes": 10240,
+      "op_tensor_size": 10240,
+      "offset": 2502656
+    },
+    "/model/layers.26/attn/GroupQueryAttention/output_0.out2_26": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2512896
+    },
+    "/model/layers.26/attn/o_proj/MatMulNBits/output_0.out5_4_79": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2521088
+    },
+    "/model/layers.26/post_attention_layernorm/output_3.out4_52": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2529280
+    },
+    "/model/layers.26/post_attention_layernorm/output_0.out4_52": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2537472
+    },
+    "/model/layers.26/mlp/Mul/output_0.out3_26": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        14336
+      ],
+      "size_in_bytes": 28672,
+      "op_tensor_size": 28672,
+      "offset": 2545664
+    },
+    "/model/layers.26/mlp/down_proj/MatMulNBits/output_0.out5_4_80": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2574336
+    },
+    "/model/layers.27/input_layernorm/output_3.out4_53": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2582528
+    },
+    "/model/layers.27/input_layernorm/output_0.out4_53": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2590720
+    },
+    "/model/layers.27/attn/qk_proj/MatMulNBits/output_0.out5_4_81": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        5120
+      ],
+      "size_in_bytes": 10240,
+      "op_tensor_size": 10240,
+      "offset": 2598912
+    },
+    "/model/layers.27/attn/GroupQueryAttention/output_0.out2_27": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2609152
+    },
+    "/model/layers.27/attn/o_proj/MatMulNBits/output_0.out5_4_82": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2617344
+    },
+    "/model/layers.27/post_attention_layernorm/output_3.out4_54": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2625536
+    },
+    "/model/layers.27/post_attention_layernorm/output_0.out4_54": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2633728
+    },
+    "/model/layers.27/mlp/Mul/output_0.out3_27": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        14336
+      ],
+      "size_in_bytes": 28672,
+      "op_tensor_size": 28672,
+      "offset": 2641920
+    },
+    "/model/layers.27/mlp/down_proj/MatMulNBits/output_0.out5_4_83": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2670592
+    },
+    "/model/layers.28/input_layernorm/output_3.out4_55": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2678784
+    },
+    "/model/layers.28/input_layernorm/output_0.out4_55": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2686976
+    },
+    "/model/layers.28/attn/qk_proj/MatMulNBits/output_0.out5_4_84": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        5120
+      ],
+      "size_in_bytes": 10240,
+      "op_tensor_size": 10240,
+      "offset": 2695168
+    },
+    "/model/layers.28/attn/GroupQueryAttention/output_0.out2_28": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2705408
+    },
+    "/model/layers.28/attn/o_proj/MatMulNBits/output_0.out5_4_85": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2713600
+    },
+    "/model/layers.28/post_attention_layernorm/output_3.out4_56": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2721792
+    },
+    "/model/layers.28/post_attention_layernorm/output_0.out4_56": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2729984
+    },
+    "/model/layers.28/mlp/Mul/output_0.out3_28": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        14336
+      ],
+      "size_in_bytes": 28672,
+      "op_tensor_size": 28672,
+      "offset": 2738176
+    },
+    "/model/layers.28/mlp/down_proj/MatMulNBits/output_0.out5_4_86": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2766848
+    },
+    "/model/layers.29/input_layernorm/output_3.out4_57": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2775040
+    },
+    "/model/layers.29/input_layernorm/output_0.out4_57": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2783232
+    },
+    "/model/layers.29/attn/qk_proj/MatMulNBits/output_0.out5_4_87": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        5120
+      ],
+      "size_in_bytes": 10240,
+      "op_tensor_size": 10240,
+      "offset": 2791424
+    },
+    "/model/layers.29/attn/GroupQueryAttention/output_0.out2_29": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2801664
+    },
+    "/model/layers.29/attn/o_proj/MatMulNBits/output_0.out5_4_88": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2809856
+    },
+    "/model/layers.29/post_attention_layernorm/output_3.out4_58": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2818048
+    },
+    "/model/layers.29/post_attention_layernorm/output_0.out4_58": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2826240
+    },
+    "/model/layers.29/mlp/Mul/output_0.out3_29": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        14336
+      ],
+      "size_in_bytes": 28672,
+      "op_tensor_size": 28672,
+      "offset": 2834432
+    },
+    "/model/layers.29/mlp/down_proj/MatMulNBits/output_0.out5_4_89": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2863104
+    },
+    "/model/layers.30/input_layernorm/output_3.out4_59": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2871296
+    },
+    "/model/layers.30/input_layernorm/output_0.out4_59": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2879488
+    },
+    "/model/layers.30/attn/qk_proj/MatMulNBits/output_0.out5_4_90": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        5120
+      ],
+      "size_in_bytes": 10240,
+      "op_tensor_size": 10240,
+      "offset": 2887680
+    },
+    "/model/layers.30/attn/GroupQueryAttention/output_0.out2_30": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2897920
+    },
+    "/model/layers.30/attn/o_proj/MatMulNBits/output_0.out5_4_91": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2906112
+    },
+    "/model/layers.30/post_attention_layernorm/output_3.out4_60": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2914304
+    },
+    "/model/layers.30/post_attention_layernorm/output_0.out4_60": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2922496
+    },
+    "/model/layers.30/mlp/Mul/output_0.out3_30": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        14336
+      ],
+      "size_in_bytes": 28672,
+      "op_tensor_size": 28672,
+      "offset": 2930688
+    },
+    "/model/layers.30/mlp/down_proj/MatMulNBits/output_0.out5_4_92": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2959360
+    },
+    "/model/layers.31/input_layernorm/output_3.out4_61": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2967552
+    },
+    "/model/layers.31/input_layernorm/output_0.out4_61": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2975744
+    },
+    "/model/layers.31/attn/qk_proj/MatMulNBits/output_0.out5_4_93": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        5120
+      ],
+      "size_in_bytes": 10240,
+      "op_tensor_size": 10240,
+      "offset": 2983936
+    },
+    "/model/layers.31/attn/GroupQueryAttention/output_0.out2_31": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2994176
+    },
+    "/model/layers.31/attn/o_proj/MatMulNBits/output_0.out5_4_94": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 3002368
+    },
+    "/model/layers.31/post_attention_layernorm/output_3.out4_62": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 3010560
+    },
+    "/model/layers.31/post_attention_layernorm/output_0.out4_62": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 3018752
+    },
+    "/model/layers.31/mlp/Mul/output_0.out3_31": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        14336
+      ],
+      "size_in_bytes": 28672,
+      "op_tensor_size": 28672,
+      "offset": 3026944
+    },
+    "/model/layers.31/mlp/down_proj/MatMulNBits/output_0.out5_4_95": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 3055616
+    },
+    "/model/layers.32/final_norm_layernorm/output_0.out4_63": {
+      "packed_buffer_label": "scratch",
+      "xrt_arg_id": 2,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        1,
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 3063808
+    },
+    "model.layers.0.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        5120
+      ],
+      "size_in_bytes": 20971520,
+      "op_tensor_size": 20971520,
+      "offset": 0,
+      "file_name": ".cache\\MatMulNBits_2_0_0.const",
+      "file_size": 20971520
+    },
+    "model.layers.0.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        5120
+      ],
+      "size_in_bytes": 20480,
+      "op_tensor_size": 20480,
+      "offset": 20971520,
+      "file_name": ".cache\\MatMulNBits_2_0_1.const",
+      "file_size": 20480
+    },
+    "model.layers.0.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 655360,
+      "op_tensor_size": 655360,
+      "offset": 20992000,
+      "file_name": ".cache\\MatMulNBits_2_0_2.const",
+      "file_size": 655360
+    },
+    "model.layers.0.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 163840,
+      "op_tensor_size": 163840,
+      "offset": 21647360,
+      "file_name": ".cache\\MatMulNBits_2_0_3.const",
+      "file_size": 163840
+    },
+    "model.layers.0.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        1024
+      ],
+      "size_in_bytes": 4194304,
+      "op_tensor_size": 4194304,
+      "offset": 21811200,
+      "file_name": ".cache\\MatMulNBits_2_0_4.const",
+      "file_size": 4194304
+    },
+    "model.layers.0.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        1024
+      ],
+      "size_in_bytes": 4096,
+      "op_tensor_size": 4096,
+      "offset": 26005504,
+      "file_name": ".cache\\MatMulNBits_2_0_5.const",
+      "file_size": 4096
+    },
+    "model.layers.0.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 26009600,
+      "file_name": ".cache\\MatMulNBits_2_0_6.const",
+      "file_size": 131072
+    },
+    "model.layers.0.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 26140672,
+      "file_name": ".cache\\MatMulNBits_2_0_7.const",
+      "file_size": 32768
+    },
+    "model.layers.0.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        4096
+      ],
+      "size_in_bytes": 16777216,
+      "op_tensor_size": 16777216,
+      "offset": 26173440,
+      "file_name": ".cache\\MatMulNBits_2_0_8.const",
+      "file_size": 16777216
+    },
+    "model.layers.0.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 42950656,
+      "file_name": ".cache\\MatMulNBits_2_0_9.const",
+      "file_size": 16384
+    },
+    "model.layers.0.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 524288,
+      "op_tensor_size": 524288,
+      "offset": 42967040,
+      "file_name": ".cache\\MatMulNBits_2_0_10.const",
+      "file_size": 524288
+    },
+    "model.layers.0.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 43491328,
+      "file_name": ".cache\\MatMulNBits_2_0_11.const",
+      "file_size": 131072
+    },
+    "model.layers.0.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 43622400,
+      "file_name": ".cache\\MatMulNBits_2_0_12.const",
+      "file_size": 8192
+    },
+    "model.layers.0.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 43630592,
+      "file_name": ".cache\\MatMulNBits_2_0_13.const",
+      "file_size": 29360128
+    },
+    "model.layers.0.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 72990720,
+      "file_name": ".cache\\MatMulNBits_2_0_14.const",
+      "file_size": 1835008
+    },
+    "model.layers.0.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 74825728,
+      "file_name": ".cache\\MatMulNBits_2_0_15.const",
+      "file_size": 229376
+    },
+    "model.layers.0.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 75055104,
+      "file_name": ".cache\\MatMulNBits_2_0_16.const",
+      "file_size": 57344
+    },
+    "model.layers.0.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 75112448,
+      "file_name": ".cache\\MatMulNBits_2_0_17.const",
+      "file_size": 29360128
+    },
+    "model.layers.0.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 104472576,
+      "file_name": ".cache\\MatMulNBits_2_0_18.const",
+      "file_size": 1835008
+    },
+    "model.layers.0.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 106307584,
+      "file_name": ".cache\\MatMulNBits_2_0_19.const",
+      "file_size": 229376
+    },
+    "model.layers.0.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 106536960,
+      "file_name": ".cache\\MatMulNBits_2_0_20.const",
+      "file_size": 57344
+    },
+    "model.layers.0.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        14336,
+        4096
+      ],
+      "size_in_bytes": 58720256,
+      "op_tensor_size": 58720256,
+      "offset": 106594304,
+      "file_name": ".cache\\MatMulNBits_2_0_21.const",
+      "file_size": 58720256
+    },
+    "model.layers.0.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 165314560,
+      "file_name": ".cache\\MatMulNBits_2_0_22.const",
+      "file_size": 16384
+    },
+    "model.layers.0.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 165330944,
+      "file_name": ".cache\\MatMulNBits_2_0_23.const",
+      "file_size": 1835008
+    },
+    "model.layers.0.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 458752,
+      "op_tensor_size": 458752,
+      "offset": 167165952,
+      "file_name": ".cache\\MatMulNBits_2_0_24.const",
+      "file_size": 458752
+    },
+    "model.layers.1.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 167624704,
+      "file_name": ".cache\\MatMulNBits_2_0_25.const",
+      "file_size": 8192
+    },
+    "model.layers.1.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        5120
+      ],
+      "size_in_bytes": 20971520,
+      "op_tensor_size": 20971520,
+      "offset": 167632896,
+      "file_name": ".cache\\MatMulNBits_2_0_26.const",
+      "file_size": 20971520
+    },
+    "model.layers.1.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        5120
+      ],
+      "size_in_bytes": 20480,
+      "op_tensor_size": 20480,
+      "offset": 188604416,
+      "file_name": ".cache\\MatMulNBits_2_0_27.const",
+      "file_size": 20480
+    },
+    "model.layers.1.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 655360,
+      "op_tensor_size": 655360,
+      "offset": 188624896,
+      "file_name": ".cache\\MatMulNBits_2_0_28.const",
+      "file_size": 655360
+    },
+    "model.layers.1.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 163840,
+      "op_tensor_size": 163840,
+      "offset": 189280256,
+      "file_name": ".cache\\MatMulNBits_2_0_29.const",
+      "file_size": 163840
+    },
+    "model.layers.1.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        1024
+      ],
+      "size_in_bytes": 4194304,
+      "op_tensor_size": 4194304,
+      "offset": 189444096,
+      "file_name": ".cache\\MatMulNBits_2_0_30.const",
+      "file_size": 4194304
+    },
+    "model.layers.1.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        1024
+      ],
+      "size_in_bytes": 4096,
+      "op_tensor_size": 4096,
+      "offset": 193638400,
+      "file_name": ".cache\\MatMulNBits_2_0_31.const",
+      "file_size": 4096
+    },
+    "model.layers.1.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 193642496,
+      "file_name": ".cache\\MatMulNBits_2_0_32.const",
+      "file_size": 131072
+    },
+    "model.layers.1.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 193773568,
+      "file_name": ".cache\\MatMulNBits_2_0_33.const",
+      "file_size": 32768
+    },
+    "model.layers.1.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        4096
+      ],
+      "size_in_bytes": 16777216,
+      "op_tensor_size": 16777216,
+      "offset": 193806336,
+      "file_name": ".cache\\MatMulNBits_2_0_34.const",
+      "file_size": 16777216
+    },
+    "model.layers.1.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 210583552,
+      "file_name": ".cache\\MatMulNBits_2_0_35.const",
+      "file_size": 16384
+    },
+    "model.layers.1.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 524288,
+      "op_tensor_size": 524288,
+      "offset": 210599936,
+      "file_name": ".cache\\MatMulNBits_2_0_36.const",
+      "file_size": 524288
+    },
+    "model.layers.1.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 211124224,
+      "file_name": ".cache\\MatMulNBits_2_0_37.const",
+      "file_size": 131072
+    },
+    "model.layers.1.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 211255296,
+      "file_name": ".cache\\MatMulNBits_2_0_38.const",
+      "file_size": 8192
+    },
+    "model.layers.1.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 211263488,
+      "file_name": ".cache\\MatMulNBits_2_0_39.const",
+      "file_size": 29360128
+    },
+    "model.layers.1.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 240623616,
+      "file_name": ".cache\\MatMulNBits_2_0_40.const",
+      "file_size": 1835008
+    },
+    "model.layers.1.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 242458624,
+      "file_name": ".cache\\MatMulNBits_2_0_41.const",
+      "file_size": 229376
+    },
+    "model.layers.1.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 242688000,
+      "file_name": ".cache\\MatMulNBits_2_0_42.const",
+      "file_size": 57344
+    },
+    "model.layers.1.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 242745344,
+      "file_name": ".cache\\MatMulNBits_2_0_43.const",
+      "file_size": 29360128
+    },
+    "model.layers.1.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 272105472,
+      "file_name": ".cache\\MatMulNBits_2_0_44.const",
+      "file_size": 1835008
+    },
+    "model.layers.1.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 273940480,
+      "file_name": ".cache\\MatMulNBits_2_0_45.const",
+      "file_size": 229376
+    },
+    "model.layers.1.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 274169856,
+      "file_name": ".cache\\MatMulNBits_2_0_46.const",
+      "file_size": 57344
+    },
+    "model.layers.1.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        14336,
+        4096
+      ],
+      "size_in_bytes": 58720256,
+      "op_tensor_size": 58720256,
+      "offset": 274227200,
+      "file_name": ".cache\\MatMulNBits_2_0_47.const",
+      "file_size": 58720256
+    },
+    "model.layers.1.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 332947456,
+      "file_name": ".cache\\MatMulNBits_2_0_48.const",
+      "file_size": 16384
+    },
+    "model.layers.1.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 332963840,
+      "file_name": ".cache\\MatMulNBits_2_0_49.const",
+      "file_size": 1835008
+    },
+    "model.layers.1.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 458752,
+      "op_tensor_size": 458752,
+      "offset": 334798848,
+      "file_name": ".cache\\MatMulNBits_2_0_50.const",
+      "file_size": 458752
+    },
+    "model.layers.2.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 335257600,
+      "file_name": ".cache\\MatMulNBits_2_0_51.const",
+      "file_size": 8192
+    },
+    "model.layers.2.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        5120
+      ],
+      "size_in_bytes": 20971520,
+      "op_tensor_size": 20971520,
+      "offset": 335265792,
+      "file_name": ".cache\\MatMulNBits_2_0_52.const",
+      "file_size": 20971520
+    },
+    "model.layers.2.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        5120
+      ],
+      "size_in_bytes": 20480,
+      "op_tensor_size": 20480,
+      "offset": 356237312,
+      "file_name": ".cache\\MatMulNBits_2_0_53.const",
+      "file_size": 20480
+    },
+    "model.layers.2.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 655360,
+      "op_tensor_size": 655360,
+      "offset": 356257792,
+      "file_name": ".cache\\MatMulNBits_2_0_54.const",
+      "file_size": 655360
+    },
+    "model.layers.2.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 163840,
+      "op_tensor_size": 163840,
+      "offset": 356913152,
+      "file_name": ".cache\\MatMulNBits_2_0_55.const",
+      "file_size": 163840
+    },
+    "model.layers.2.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        1024
+      ],
+      "size_in_bytes": 4194304,
+      "op_tensor_size": 4194304,
+      "offset": 357076992,
+      "file_name": ".cache\\MatMulNBits_2_0_56.const",
+      "file_size": 4194304
+    },
+    "model.layers.2.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        1024
+      ],
+      "size_in_bytes": 4096,
+      "op_tensor_size": 4096,
+      "offset": 361271296,
+      "file_name": ".cache\\MatMulNBits_2_0_57.const",
+      "file_size": 4096
+    },
+    "model.layers.2.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 361275392,
+      "file_name": ".cache\\MatMulNBits_2_0_58.const",
+      "file_size": 131072
+    },
+    "model.layers.2.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 361406464,
+      "file_name": ".cache\\MatMulNBits_2_0_59.const",
+      "file_size": 32768
+    },
+    "model.layers.2.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        4096
+      ],
+      "size_in_bytes": 16777216,
+      "op_tensor_size": 16777216,
+      "offset": 361439232,
+      "file_name": ".cache\\MatMulNBits_2_0_60.const",
+      "file_size": 16777216
+    },
+    "model.layers.2.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 378216448,
+      "file_name": ".cache\\MatMulNBits_2_0_61.const",
+      "file_size": 16384
+    },
+    "model.layers.2.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 524288,
+      "op_tensor_size": 524288,
+      "offset": 378232832,
+      "file_name": ".cache\\MatMulNBits_2_0_62.const",
+      "file_size": 524288
+    },
+    "model.layers.2.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 378757120,
+      "file_name": ".cache\\MatMulNBits_2_0_63.const",
+      "file_size": 131072
+    },
+    "model.layers.2.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 378888192,
+      "file_name": ".cache\\MatMulNBits_2_0_64.const",
+      "file_size": 8192
+    },
+    "model.layers.2.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 378896384,
+      "file_name": ".cache\\MatMulNBits_2_0_65.const",
+      "file_size": 29360128
+    },
+    "model.layers.2.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 408256512,
+      "file_name": ".cache\\MatMulNBits_2_0_66.const",
+      "file_size": 1835008
+    },
+    "model.layers.2.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 410091520,
+      "file_name": ".cache\\MatMulNBits_2_0_67.const",
+      "file_size": 229376
+    },
+    "model.layers.2.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 410320896,
+      "file_name": ".cache\\MatMulNBits_2_0_68.const",
+      "file_size": 57344
+    },
+    "model.layers.2.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 410378240,
+      "file_name": ".cache\\MatMulNBits_2_0_69.const",
+      "file_size": 29360128
+    },
+    "model.layers.2.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 439738368,
+      "file_name": ".cache\\MatMulNBits_2_0_70.const",
+      "file_size": 1835008
+    },
+    "model.layers.2.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 441573376,
+      "file_name": ".cache\\MatMulNBits_2_0_71.const",
+      "file_size": 229376
+    },
+    "model.layers.2.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 441802752,
+      "file_name": ".cache\\MatMulNBits_2_0_72.const",
+      "file_size": 57344
+    },
+    "model.layers.2.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        14336,
+        4096
+      ],
+      "size_in_bytes": 58720256,
+      "op_tensor_size": 58720256,
+      "offset": 441860096,
+      "file_name": ".cache\\MatMulNBits_2_0_73.const",
+      "file_size": 58720256
+    },
+    "model.layers.2.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 500580352,
+      "file_name": ".cache\\MatMulNBits_2_0_74.const",
+      "file_size": 16384
+    },
+    "model.layers.2.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 500596736,
+      "file_name": ".cache\\MatMulNBits_2_0_75.const",
+      "file_size": 1835008
+    },
+    "model.layers.2.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 458752,
+      "op_tensor_size": 458752,
+      "offset": 502431744,
+      "file_name": ".cache\\MatMulNBits_2_0_76.const",
+      "file_size": 458752
+    },
+    "model.layers.3.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 502890496,
+      "file_name": ".cache\\MatMulNBits_2_0_77.const",
+      "file_size": 8192
+    },
+    "model.layers.3.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        5120
+      ],
+      "size_in_bytes": 20971520,
+      "op_tensor_size": 20971520,
+      "offset": 502898688,
+      "file_name": ".cache\\MatMulNBits_2_0_78.const",
+      "file_size": 20971520
+    },
+    "model.layers.3.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        5120
+      ],
+      "size_in_bytes": 20480,
+      "op_tensor_size": 20480,
+      "offset": 523870208,
+      "file_name": ".cache\\MatMulNBits_2_0_79.const",
+      "file_size": 20480
+    },
+    "model.layers.3.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 655360,
+      "op_tensor_size": 655360,
+      "offset": 523890688,
+      "file_name": ".cache\\MatMulNBits_2_0_80.const",
+      "file_size": 655360
+    },
+    "model.layers.3.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 163840,
+      "op_tensor_size": 163840,
+      "offset": 524546048,
+      "file_name": ".cache\\MatMulNBits_2_0_81.const",
+      "file_size": 163840
+    },
+    "model.layers.3.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        1024
+      ],
+      "size_in_bytes": 4194304,
+      "op_tensor_size": 4194304,
+      "offset": 524709888,
+      "file_name": ".cache\\MatMulNBits_2_0_82.const",
+      "file_size": 4194304
+    },
+    "model.layers.3.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        1024
+      ],
+      "size_in_bytes": 4096,
+      "op_tensor_size": 4096,
+      "offset": 528904192,
+      "file_name": ".cache\\MatMulNBits_2_0_83.const",
+      "file_size": 4096
+    },
+    "model.layers.3.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 528908288,
+      "file_name": ".cache\\MatMulNBits_2_0_84.const",
+      "file_size": 131072
+    },
+    "model.layers.3.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 529039360,
+      "file_name": ".cache\\MatMulNBits_2_0_85.const",
+      "file_size": 32768
+    },
+    "model.layers.3.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        4096
+      ],
+      "size_in_bytes": 16777216,
+      "op_tensor_size": 16777216,
+      "offset": 529072128,
+      "file_name": ".cache\\MatMulNBits_2_0_86.const",
+      "file_size": 16777216
+    },
+    "model.layers.3.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 545849344,
+      "file_name": ".cache\\MatMulNBits_2_0_87.const",
+      "file_size": 16384
+    },
+    "model.layers.3.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 524288,
+      "op_tensor_size": 524288,
+      "offset": 545865728,
+      "file_name": ".cache\\MatMulNBits_2_0_88.const",
+      "file_size": 524288
+    },
+    "model.layers.3.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 546390016,
+      "file_name": ".cache\\MatMulNBits_2_0_89.const",
+      "file_size": 131072
+    },
+    "model.layers.3.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 546521088,
+      "file_name": ".cache\\MatMulNBits_2_0_90.const",
+      "file_size": 8192
+    },
+    "model.layers.3.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 546529280,
+      "file_name": ".cache\\MatMulNBits_2_0_91.const",
+      "file_size": 29360128
+    },
+    "model.layers.3.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 575889408,
+      "file_name": ".cache\\MatMulNBits_2_0_92.const",
+      "file_size": 1835008
+    },
+    "model.layers.3.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 577724416,
+      "file_name": ".cache\\MatMulNBits_2_0_93.const",
+      "file_size": 229376
+    },
+    "model.layers.3.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 577953792,
+      "file_name": ".cache\\MatMulNBits_2_0_94.const",
+      "file_size": 57344
+    },
+    "model.layers.3.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 578011136,
+      "file_name": ".cache\\MatMulNBits_2_0_95.const",
+      "file_size": 29360128
+    },
+    "model.layers.3.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 607371264,
+      "file_name": ".cache\\MatMulNBits_2_0_96.const",
+      "file_size": 1835008
+    },
+    "model.layers.3.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 609206272,
+      "file_name": ".cache\\MatMulNBits_2_0_97.const",
+      "file_size": 229376
+    },
+    "model.layers.3.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 609435648,
+      "file_name": ".cache\\MatMulNBits_2_0_98.const",
+      "file_size": 57344
+    },
+    "model.layers.3.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        14336,
+        4096
+      ],
+      "size_in_bytes": 58720256,
+      "op_tensor_size": 58720256,
+      "offset": 609492992,
+      "file_name": ".cache\\MatMulNBits_2_0_99.const",
+      "file_size": 58720256
+    },
+    "model.layers.3.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 668213248,
+      "file_name": ".cache\\MatMulNBits_2_0_100.const",
+      "file_size": 16384
+    },
+    "model.layers.3.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 668229632,
+      "file_name": ".cache\\MatMulNBits_2_0_101.const",
+      "file_size": 1835008
+    },
+    "model.layers.3.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 458752,
+      "op_tensor_size": 458752,
+      "offset": 670064640,
+      "file_name": ".cache\\MatMulNBits_2_0_102.const",
+      "file_size": 458752
+    },
+    "model.layers.4.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 670523392,
+      "file_name": ".cache\\MatMulNBits_2_0_103.const",
+      "file_size": 8192
+    },
+    "model.layers.4.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        5120
+      ],
+      "size_in_bytes": 20971520,
+      "op_tensor_size": 20971520,
+      "offset": 670531584,
+      "file_name": ".cache\\MatMulNBits_2_0_104.const",
+      "file_size": 20971520
+    },
+    "model.layers.4.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        5120
+      ],
+      "size_in_bytes": 20480,
+      "op_tensor_size": 20480,
+      "offset": 691503104,
+      "file_name": ".cache\\MatMulNBits_2_0_105.const",
+      "file_size": 20480
+    },
+    "model.layers.4.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 655360,
+      "op_tensor_size": 655360,
+      "offset": 691523584,
+      "file_name": ".cache\\MatMulNBits_2_0_106.const",
+      "file_size": 655360
+    },
+    "model.layers.4.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 163840,
+      "op_tensor_size": 163840,
+      "offset": 692178944,
+      "file_name": ".cache\\MatMulNBits_2_0_107.const",
+      "file_size": 163840
+    },
+    "model.layers.4.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        1024
+      ],
+      "size_in_bytes": 4194304,
+      "op_tensor_size": 4194304,
+      "offset": 692342784,
+      "file_name": ".cache\\MatMulNBits_2_0_108.const",
+      "file_size": 4194304
+    },
+    "model.layers.4.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        1024
+      ],
+      "size_in_bytes": 4096,
+      "op_tensor_size": 4096,
+      "offset": 696537088,
+      "file_name": ".cache\\MatMulNBits_2_0_109.const",
+      "file_size": 4096
+    },
+    "model.layers.4.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 696541184,
+      "file_name": ".cache\\MatMulNBits_2_0_110.const",
+      "file_size": 131072
+    },
+    "model.layers.4.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 696672256,
+      "file_name": ".cache\\MatMulNBits_2_0_111.const",
+      "file_size": 32768
+    },
+    "model.layers.4.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        4096
+      ],
+      "size_in_bytes": 16777216,
+      "op_tensor_size": 16777216,
+      "offset": 696705024,
+      "file_name": ".cache\\MatMulNBits_2_0_112.const",
+      "file_size": 16777216
+    },
+    "model.layers.4.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 713482240,
+      "file_name": ".cache\\MatMulNBits_2_0_113.const",
+      "file_size": 16384
+    },
+    "model.layers.4.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 524288,
+      "op_tensor_size": 524288,
+      "offset": 713498624,
+      "file_name": ".cache\\MatMulNBits_2_0_114.const",
+      "file_size": 524288
+    },
+    "model.layers.4.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 714022912,
+      "file_name": ".cache\\MatMulNBits_2_0_115.const",
+      "file_size": 131072
+    },
+    "model.layers.4.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 714153984,
+      "file_name": ".cache\\MatMulNBits_2_0_116.const",
+      "file_size": 8192
+    },
+    "model.layers.4.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 714162176,
+      "file_name": ".cache\\MatMulNBits_2_0_117.const",
+      "file_size": 29360128
+    },
+    "model.layers.4.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 743522304,
+      "file_name": ".cache\\MatMulNBits_2_0_118.const",
+      "file_size": 1835008
+    },
+    "model.layers.4.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 745357312,
+      "file_name": ".cache\\MatMulNBits_2_0_119.const",
+      "file_size": 229376
+    },
+    "model.layers.4.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 745586688,
+      "file_name": ".cache\\MatMulNBits_2_0_120.const",
+      "file_size": 57344
+    },
+    "model.layers.4.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 745644032,
+      "file_name": ".cache\\MatMulNBits_2_0_121.const",
+      "file_size": 29360128
+    },
+    "model.layers.4.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 775004160,
+      "file_name": ".cache\\MatMulNBits_2_0_122.const",
+      "file_size": 1835008
+    },
+    "model.layers.4.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 776839168,
+      "file_name": ".cache\\MatMulNBits_2_0_123.const",
+      "file_size": 229376
+    },
+    "model.layers.4.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 777068544,
+      "file_name": ".cache\\MatMulNBits_2_0_124.const",
+      "file_size": 57344
+    },
+    "model.layers.4.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        14336,
+        4096
+      ],
+      "size_in_bytes": 58720256,
+      "op_tensor_size": 58720256,
+      "offset": 777125888,
+      "file_name": ".cache\\MatMulNBits_2_0_125.const",
+      "file_size": 58720256
+    },
+    "model.layers.4.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 835846144,
+      "file_name": ".cache\\MatMulNBits_2_0_126.const",
+      "file_size": 16384
+    },
+    "model.layers.4.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 835862528,
+      "file_name": ".cache\\MatMulNBits_2_0_127.const",
+      "file_size": 1835008
+    },
+    "model.layers.4.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 458752,
+      "op_tensor_size": 458752,
+      "offset": 837697536,
+      "file_name": ".cache\\MatMulNBits_2_0_128.const",
+      "file_size": 458752
+    },
+    "model.layers.5.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 838156288,
+      "file_name": ".cache\\MatMulNBits_2_0_129.const",
+      "file_size": 8192
+    },
+    "model.layers.5.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        5120
+      ],
+      "size_in_bytes": 20971520,
+      "op_tensor_size": 20971520,
+      "offset": 838164480,
+      "file_name": ".cache\\MatMulNBits_2_0_130.const",
+      "file_size": 20971520
+    },
+    "model.layers.5.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        5120
+      ],
+      "size_in_bytes": 20480,
+      "op_tensor_size": 20480,
+      "offset": 859136000,
+      "file_name": ".cache\\MatMulNBits_2_0_131.const",
+      "file_size": 20480
+    },
+    "model.layers.5.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 655360,
+      "op_tensor_size": 655360,
+      "offset": 859156480,
+      "file_name": ".cache\\MatMulNBits_2_0_132.const",
+      "file_size": 655360
+    },
+    "model.layers.5.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 163840,
+      "op_tensor_size": 163840,
+      "offset": 859811840,
+      "file_name": ".cache\\MatMulNBits_2_0_133.const",
+      "file_size": 163840
+    },
+    "model.layers.5.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        1024
+      ],
+      "size_in_bytes": 4194304,
+      "op_tensor_size": 4194304,
+      "offset": 859975680,
+      "file_name": ".cache\\MatMulNBits_2_0_134.const",
+      "file_size": 4194304
+    },
+    "model.layers.5.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        1024
+      ],
+      "size_in_bytes": 4096,
+      "op_tensor_size": 4096,
+      "offset": 864169984,
+      "file_name": ".cache\\MatMulNBits_2_0_135.const",
+      "file_size": 4096
+    },
+    "model.layers.5.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 864174080,
+      "file_name": ".cache\\MatMulNBits_2_0_136.const",
+      "file_size": 131072
+    },
+    "model.layers.5.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 864305152,
+      "file_name": ".cache\\MatMulNBits_2_0_137.const",
+      "file_size": 32768
+    },
+    "model.layers.5.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        4096
+      ],
+      "size_in_bytes": 16777216,
+      "op_tensor_size": 16777216,
+      "offset": 864337920,
+      "file_name": ".cache\\MatMulNBits_2_0_138.const",
+      "file_size": 16777216
+    },
+    "model.layers.5.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 881115136,
+      "file_name": ".cache\\MatMulNBits_2_0_139.const",
+      "file_size": 16384
+    },
+    "model.layers.5.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 524288,
+      "op_tensor_size": 524288,
+      "offset": 881131520,
+      "file_name": ".cache\\MatMulNBits_2_0_140.const",
+      "file_size": 524288
+    },
+    "model.layers.5.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 881655808,
+      "file_name": ".cache\\MatMulNBits_2_0_141.const",
+      "file_size": 131072
+    },
+    "model.layers.5.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 881786880,
+      "file_name": ".cache\\MatMulNBits_2_0_142.const",
+      "file_size": 8192
+    },
+    "model.layers.5.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 881795072,
+      "file_name": ".cache\\MatMulNBits_2_0_143.const",
+      "file_size": 29360128
+    },
+    "model.layers.5.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 911155200,
+      "file_name": ".cache\\MatMulNBits_2_0_144.const",
+      "file_size": 1835008
+    },
+    "model.layers.5.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 912990208,
+      "file_name": ".cache\\MatMulNBits_2_0_145.const",
+      "file_size": 229376
+    },
+    "model.layers.5.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 913219584,
+      "file_name": ".cache\\MatMulNBits_2_0_146.const",
+      "file_size": 57344
+    },
+    "model.layers.5.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 913276928,
+      "file_name": ".cache\\MatMulNBits_2_0_147.const",
+      "file_size": 29360128
+    },
+    "model.layers.5.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 942637056,
+      "file_name": ".cache\\MatMulNBits_2_0_148.const",
+      "file_size": 1835008
+    },
+    "model.layers.5.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 944472064,
+      "file_name": ".cache\\MatMulNBits_2_0_149.const",
+      "file_size": 229376
+    },
+    "model.layers.5.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 944701440,
+      "file_name": ".cache\\MatMulNBits_2_0_150.const",
+      "file_size": 57344
+    },
+    "model.layers.5.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        14336,
+        4096
+      ],
+      "size_in_bytes": 58720256,
+      "op_tensor_size": 58720256,
+      "offset": 944758784,
+      "file_name": ".cache\\MatMulNBits_2_0_151.const",
+      "file_size": 58720256
+    },
+    "model.layers.5.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 1003479040,
+      "file_name": ".cache\\MatMulNBits_2_0_152.const",
+      "file_size": 16384
+    },
+    "model.layers.5.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 1003495424,
+      "file_name": ".cache\\MatMulNBits_2_0_153.const",
+      "file_size": 1835008
+    },
+    "model.layers.5.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 458752,
+      "op_tensor_size": 458752,
+      "offset": 1005330432,
+      "file_name": ".cache\\MatMulNBits_2_0_154.const",
+      "file_size": 458752
+    },
+    "model.layers.6.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1005789184,
+      "file_name": ".cache\\MatMulNBits_2_0_155.const",
+      "file_size": 8192
+    },
+    "model.layers.6.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        5120
+      ],
+      "size_in_bytes": 20971520,
+      "op_tensor_size": 20971520,
+      "offset": 1005797376,
+      "file_name": ".cache\\MatMulNBits_2_0_156.const",
+      "file_size": 20971520
+    },
+    "model.layers.6.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        5120
+      ],
+      "size_in_bytes": 20480,
+      "op_tensor_size": 20480,
+      "offset": 1026768896,
+      "file_name": ".cache\\MatMulNBits_2_0_157.const",
+      "file_size": 20480
+    },
+    "model.layers.6.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 655360,
+      "op_tensor_size": 655360,
+      "offset": 1026789376,
+      "file_name": ".cache\\MatMulNBits_2_0_158.const",
+      "file_size": 655360
+    },
+    "model.layers.6.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 163840,
+      "op_tensor_size": 163840,
+      "offset": 1027444736,
+      "file_name": ".cache\\MatMulNBits_2_0_159.const",
+      "file_size": 163840
+    },
+    "model.layers.6.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        1024
+      ],
+      "size_in_bytes": 4194304,
+      "op_tensor_size": 4194304,
+      "offset": 1027608576,
+      "file_name": ".cache\\MatMulNBits_2_0_160.const",
+      "file_size": 4194304
+    },
+    "model.layers.6.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        1024
+      ],
+      "size_in_bytes": 4096,
+      "op_tensor_size": 4096,
+      "offset": 1031802880,
+      "file_name": ".cache\\MatMulNBits_2_0_161.const",
+      "file_size": 4096
+    },
+    "model.layers.6.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 1031806976,
+      "file_name": ".cache\\MatMulNBits_2_0_162.const",
+      "file_size": 131072
+    },
+    "model.layers.6.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 1031938048,
+      "file_name": ".cache\\MatMulNBits_2_0_163.const",
+      "file_size": 32768
+    },
+    "model.layers.6.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        4096
+      ],
+      "size_in_bytes": 16777216,
+      "op_tensor_size": 16777216,
+      "offset": 1031970816,
+      "file_name": ".cache\\MatMulNBits_2_0_164.const",
+      "file_size": 16777216
+    },
+    "model.layers.6.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 1048748032,
+      "file_name": ".cache\\MatMulNBits_2_0_165.const",
+      "file_size": 16384
+    },
+    "model.layers.6.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 524288,
+      "op_tensor_size": 524288,
+      "offset": 1048764416,
+      "file_name": ".cache\\MatMulNBits_2_0_166.const",
+      "file_size": 524288
+    },
+    "model.layers.6.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 1049288704,
+      "file_name": ".cache\\MatMulNBits_2_0_167.const",
+      "file_size": 131072
+    },
+    "model.layers.6.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1049419776,
+      "file_name": ".cache\\MatMulNBits_2_0_168.const",
+      "file_size": 8192
+    },
+    "model.layers.6.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 1049427968,
+      "file_name": ".cache\\MatMulNBits_2_0_169.const",
+      "file_size": 29360128
+    },
+    "model.layers.6.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 1078788096,
+      "file_name": ".cache\\MatMulNBits_2_0_170.const",
+      "file_size": 1835008
+    },
+    "model.layers.6.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 1080623104,
+      "file_name": ".cache\\MatMulNBits_2_0_171.const",
+      "file_size": 229376
+    },
+    "model.layers.6.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 1080852480,
+      "file_name": ".cache\\MatMulNBits_2_0_172.const",
+      "file_size": 57344
+    },
+    "model.layers.6.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 1080909824,
+      "file_name": ".cache\\MatMulNBits_2_0_173.const",
+      "file_size": 29360128
+    },
+    "model.layers.6.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 1110269952,
+      "file_name": ".cache\\MatMulNBits_2_0_174.const",
+      "file_size": 1835008
+    },
+    "model.layers.6.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 1112104960,
+      "file_name": ".cache\\MatMulNBits_2_0_175.const",
+      "file_size": 229376
+    },
+    "model.layers.6.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 1112334336,
+      "file_name": ".cache\\MatMulNBits_2_0_176.const",
+      "file_size": 57344
+    },
+    "model.layers.6.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        14336,
+        4096
+      ],
+      "size_in_bytes": 58720256,
+      "op_tensor_size": 58720256,
+      "offset": 1112391680,
+      "file_name": ".cache\\MatMulNBits_2_0_177.const",
+      "file_size": 58720256
+    },
+    "model.layers.6.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 1171111936,
+      "file_name": ".cache\\MatMulNBits_2_0_178.const",
+      "file_size": 16384
+    },
+    "model.layers.6.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 1171128320,
+      "file_name": ".cache\\MatMulNBits_2_0_179.const",
+      "file_size": 1835008
+    },
+    "model.layers.6.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 458752,
+      "op_tensor_size": 458752,
+      "offset": 1172963328,
+      "file_name": ".cache\\MatMulNBits_2_0_180.const",
+      "file_size": 458752
+    },
+    "model.layers.7.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1173422080,
+      "file_name": ".cache\\MatMulNBits_2_0_181.const",
+      "file_size": 8192
+    },
+    "model.layers.7.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        5120
+      ],
+      "size_in_bytes": 20971520,
+      "op_tensor_size": 20971520,
+      "offset": 1173430272,
+      "file_name": ".cache\\MatMulNBits_2_0_182.const",
+      "file_size": 20971520
+    },
+    "model.layers.7.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        5120
+      ],
+      "size_in_bytes": 20480,
+      "op_tensor_size": 20480,
+      "offset": 1194401792,
+      "file_name": ".cache\\MatMulNBits_2_0_183.const",
+      "file_size": 20480
+    },
+    "model.layers.7.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 655360,
+      "op_tensor_size": 655360,
+      "offset": 1194422272,
+      "file_name": ".cache\\MatMulNBits_2_0_184.const",
+      "file_size": 655360
+    },
+    "model.layers.7.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 163840,
+      "op_tensor_size": 163840,
+      "offset": 1195077632,
+      "file_name": ".cache\\MatMulNBits_2_0_185.const",
+      "file_size": 163840
+    },
+    "model.layers.7.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        1024
+      ],
+      "size_in_bytes": 4194304,
+      "op_tensor_size": 4194304,
+      "offset": 1195241472,
+      "file_name": ".cache\\MatMulNBits_2_0_186.const",
+      "file_size": 4194304
+    },
+    "model.layers.7.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        1024
+      ],
+      "size_in_bytes": 4096,
+      "op_tensor_size": 4096,
+      "offset": 1199435776,
+      "file_name": ".cache\\MatMulNBits_2_0_187.const",
+      "file_size": 4096
+    },
+    "model.layers.7.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 1199439872,
+      "file_name": ".cache\\MatMulNBits_2_0_188.const",
+      "file_size": 131072
+    },
+    "model.layers.7.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 1199570944,
+      "file_name": ".cache\\MatMulNBits_2_0_189.const",
+      "file_size": 32768
+    },
+    "model.layers.7.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        4096
+      ],
+      "size_in_bytes": 16777216,
+      "op_tensor_size": 16777216,
+      "offset": 1199603712,
+      "file_name": ".cache\\MatMulNBits_2_0_190.const",
+      "file_size": 16777216
+    },
+    "model.layers.7.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 1216380928,
+      "file_name": ".cache\\MatMulNBits_2_0_191.const",
+      "file_size": 16384
+    },
+    "model.layers.7.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 524288,
+      "op_tensor_size": 524288,
+      "offset": 1216397312,
+      "file_name": ".cache\\MatMulNBits_2_0_192.const",
+      "file_size": 524288
+    },
+    "model.layers.7.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 1216921600,
+      "file_name": ".cache\\MatMulNBits_2_0_193.const",
+      "file_size": 131072
+    },
+    "model.layers.7.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1217052672,
+      "file_name": ".cache\\MatMulNBits_2_0_194.const",
+      "file_size": 8192
+    },
+    "model.layers.7.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 1217060864,
+      "file_name": ".cache\\MatMulNBits_2_0_195.const",
+      "file_size": 29360128
+    },
+    "model.layers.7.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 1246420992,
+      "file_name": ".cache\\MatMulNBits_2_0_196.const",
+      "file_size": 1835008
+    },
+    "model.layers.7.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 1248256000,
+      "file_name": ".cache\\MatMulNBits_2_0_197.const",
+      "file_size": 229376
+    },
+    "model.layers.7.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 1248485376,
+      "file_name": ".cache\\MatMulNBits_2_0_198.const",
+      "file_size": 57344
+    },
+    "model.layers.7.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 1248542720,
+      "file_name": ".cache\\MatMulNBits_2_0_199.const",
+      "file_size": 29360128
+    },
+    "model.layers.7.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 1277902848,
+      "file_name": ".cache\\MatMulNBits_2_0_200.const",
+      "file_size": 1835008
+    },
+    "model.layers.7.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 1279737856,
+      "file_name": ".cache\\MatMulNBits_2_0_201.const",
+      "file_size": 229376
+    },
+    "model.layers.7.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 1279967232,
+      "file_name": ".cache\\MatMulNBits_2_0_202.const",
+      "file_size": 57344
+    },
+    "model.layers.7.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        14336,
+        4096
+      ],
+      "size_in_bytes": 58720256,
+      "op_tensor_size": 58720256,
+      "offset": 1280024576,
+      "file_name": ".cache\\MatMulNBits_2_0_203.const",
+      "file_size": 58720256
+    },
+    "model.layers.7.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 1338744832,
+      "file_name": ".cache\\MatMulNBits_2_0_204.const",
+      "file_size": 16384
+    },
+    "model.layers.7.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 1338761216,
+      "file_name": ".cache\\MatMulNBits_2_0_205.const",
+      "file_size": 1835008
+    },
+    "model.layers.7.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 458752,
+      "op_tensor_size": 458752,
+      "offset": 1340596224,
+      "file_name": ".cache\\MatMulNBits_2_0_206.const",
+      "file_size": 458752
+    },
+    "model.layers.8.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1341054976,
+      "file_name": ".cache\\MatMulNBits_2_0_207.const",
+      "file_size": 8192
+    },
+    "model.layers.8.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        5120
+      ],
+      "size_in_bytes": 20971520,
+      "op_tensor_size": 20971520,
+      "offset": 1341063168,
+      "file_name": ".cache\\MatMulNBits_2_0_208.const",
+      "file_size": 20971520
+    },
+    "model.layers.8.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        5120
+      ],
+      "size_in_bytes": 20480,
+      "op_tensor_size": 20480,
+      "offset": 1362034688,
+      "file_name": ".cache\\MatMulNBits_2_0_209.const",
+      "file_size": 20480
+    },
+    "model.layers.8.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 655360,
+      "op_tensor_size": 655360,
+      "offset": 1362055168,
+      "file_name": ".cache\\MatMulNBits_2_0_210.const",
+      "file_size": 655360
+    },
+    "model.layers.8.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 163840,
+      "op_tensor_size": 163840,
+      "offset": 1362710528,
+      "file_name": ".cache\\MatMulNBits_2_0_211.const",
+      "file_size": 163840
+    },
+    "model.layers.8.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        1024
+      ],
+      "size_in_bytes": 4194304,
+      "op_tensor_size": 4194304,
+      "offset": 1362874368,
+      "file_name": ".cache\\MatMulNBits_2_0_212.const",
+      "file_size": 4194304
+    },
+    "model.layers.8.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        1024
+      ],
+      "size_in_bytes": 4096,
+      "op_tensor_size": 4096,
+      "offset": 1367068672,
+      "file_name": ".cache\\MatMulNBits_2_0_213.const",
+      "file_size": 4096
+    },
+    "model.layers.8.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 1367072768,
+      "file_name": ".cache\\MatMulNBits_2_0_214.const",
+      "file_size": 131072
+    },
+    "model.layers.8.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 1367203840,
+      "file_name": ".cache\\MatMulNBits_2_0_215.const",
+      "file_size": 32768
+    },
+    "model.layers.8.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        4096
+      ],
+      "size_in_bytes": 16777216,
+      "op_tensor_size": 16777216,
+      "offset": 1367236608,
+      "file_name": ".cache\\MatMulNBits_2_0_216.const",
+      "file_size": 16777216
+    },
+    "model.layers.8.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 1384013824,
+      "file_name": ".cache\\MatMulNBits_2_0_217.const",
+      "file_size": 16384
+    },
+    "model.layers.8.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 524288,
+      "op_tensor_size": 524288,
+      "offset": 1384030208,
+      "file_name": ".cache\\MatMulNBits_2_0_218.const",
+      "file_size": 524288
+    },
+    "model.layers.8.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 1384554496,
+      "file_name": ".cache\\MatMulNBits_2_0_219.const",
+      "file_size": 131072
+    },
+    "model.layers.8.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1384685568,
+      "file_name": ".cache\\MatMulNBits_2_0_220.const",
+      "file_size": 8192
+    },
+    "model.layers.8.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 1384693760,
+      "file_name": ".cache\\MatMulNBits_2_0_221.const",
+      "file_size": 29360128
+    },
+    "model.layers.8.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 1414053888,
+      "file_name": ".cache\\MatMulNBits_2_0_222.const",
+      "file_size": 1835008
+    },
+    "model.layers.8.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 1415888896,
+      "file_name": ".cache\\MatMulNBits_2_0_223.const",
+      "file_size": 229376
+    },
+    "model.layers.8.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 1416118272,
+      "file_name": ".cache\\MatMulNBits_2_0_224.const",
+      "file_size": 57344
+    },
+    "model.layers.8.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 1416175616,
+      "file_name": ".cache\\MatMulNBits_2_0_225.const",
+      "file_size": 29360128
+    },
+    "model.layers.8.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 1445535744,
+      "file_name": ".cache\\MatMulNBits_2_0_226.const",
+      "file_size": 1835008
+    },
+    "model.layers.8.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 1447370752,
+      "file_name": ".cache\\MatMulNBits_2_0_227.const",
+      "file_size": 229376
+    },
+    "model.layers.8.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 1447600128,
+      "file_name": ".cache\\MatMulNBits_2_0_228.const",
+      "file_size": 57344
+    },
+    "model.layers.8.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        14336,
+        4096
+      ],
+      "size_in_bytes": 58720256,
+      "op_tensor_size": 58720256,
+      "offset": 1447657472,
+      "file_name": ".cache\\MatMulNBits_2_0_229.const",
+      "file_size": 58720256
+    },
+    "model.layers.8.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 1506377728,
+      "file_name": ".cache\\MatMulNBits_2_0_230.const",
+      "file_size": 16384
+    },
+    "model.layers.8.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 1506394112,
+      "file_name": ".cache\\MatMulNBits_2_0_231.const",
+      "file_size": 1835008
+    },
+    "model.layers.8.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 458752,
+      "op_tensor_size": 458752,
+      "offset": 1508229120,
+      "file_name": ".cache\\MatMulNBits_2_0_232.const",
+      "file_size": 458752
+    },
+    "model.layers.9.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1508687872,
+      "file_name": ".cache\\MatMulNBits_2_0_233.const",
+      "file_size": 8192
+    },
+    "model.layers.9.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        5120
+      ],
+      "size_in_bytes": 20971520,
+      "op_tensor_size": 20971520,
+      "offset": 1508696064,
+      "file_name": ".cache\\MatMulNBits_2_0_234.const",
+      "file_size": 20971520
+    },
+    "model.layers.9.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        5120
+      ],
+      "size_in_bytes": 20480,
+      "op_tensor_size": 20480,
+      "offset": 1529667584,
+      "file_name": ".cache\\MatMulNBits_2_0_235.const",
+      "file_size": 20480
+    },
+    "model.layers.9.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 655360,
+      "op_tensor_size": 655360,
+      "offset": 1529688064,
+      "file_name": ".cache\\MatMulNBits_2_0_236.const",
+      "file_size": 655360
+    },
+    "model.layers.9.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 163840,
+      "op_tensor_size": 163840,
+      "offset": 1530343424,
+      "file_name": ".cache\\MatMulNBits_2_0_237.const",
+      "file_size": 163840
+    },
+    "model.layers.9.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        1024
+      ],
+      "size_in_bytes": 4194304,
+      "op_tensor_size": 4194304,
+      "offset": 1530507264,
+      "file_name": ".cache\\MatMulNBits_2_0_238.const",
+      "file_size": 4194304
+    },
+    "model.layers.9.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        1024
+      ],
+      "size_in_bytes": 4096,
+      "op_tensor_size": 4096,
+      "offset": 1534701568,
+      "file_name": ".cache\\MatMulNBits_2_0_239.const",
+      "file_size": 4096
+    },
+    "model.layers.9.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 1534705664,
+      "file_name": ".cache\\MatMulNBits_2_0_240.const",
+      "file_size": 131072
+    },
+    "model.layers.9.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 1534836736,
+      "file_name": ".cache\\MatMulNBits_2_0_241.const",
+      "file_size": 32768
+    },
+    "model.layers.9.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        4096
+      ],
+      "size_in_bytes": 16777216,
+      "op_tensor_size": 16777216,
+      "offset": 1534869504,
+      "file_name": ".cache\\MatMulNBits_2_0_242.const",
+      "file_size": 16777216
+    },
+    "model.layers.9.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 1551646720,
+      "file_name": ".cache\\MatMulNBits_2_0_243.const",
+      "file_size": 16384
+    },
+    "model.layers.9.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 524288,
+      "op_tensor_size": 524288,
+      "offset": 1551663104,
+      "file_name": ".cache\\MatMulNBits_2_0_244.const",
+      "file_size": 524288
+    },
+    "model.layers.9.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 1552187392,
+      "file_name": ".cache\\MatMulNBits_2_0_245.const",
+      "file_size": 131072
+    },
+    "model.layers.9.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1552318464,
+      "file_name": ".cache\\MatMulNBits_2_0_246.const",
+      "file_size": 8192
+    },
+    "model.layers.9.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 1552326656,
+      "file_name": ".cache\\MatMulNBits_2_0_247.const",
+      "file_size": 29360128
+    },
+    "model.layers.9.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 1581686784,
+      "file_name": ".cache\\MatMulNBits_2_0_248.const",
+      "file_size": 1835008
+    },
+    "model.layers.9.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 1583521792,
+      "file_name": ".cache\\MatMulNBits_2_0_249.const",
+      "file_size": 229376
+    },
+    "model.layers.9.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 1583751168,
+      "file_name": ".cache\\MatMulNBits_2_0_250.const",
+      "file_size": 57344
+    },
+    "model.layers.9.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 1583808512,
+      "file_name": ".cache\\MatMulNBits_2_0_251.const",
+      "file_size": 29360128
+    },
+    "model.layers.9.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 1613168640,
+      "file_name": ".cache\\MatMulNBits_2_0_252.const",
+      "file_size": 1835008
+    },
+    "model.layers.9.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 1615003648,
+      "file_name": ".cache\\MatMulNBits_2_0_253.const",
+      "file_size": 229376
+    },
+    "model.layers.9.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 1615233024,
+      "file_name": ".cache\\MatMulNBits_2_0_254.const",
+      "file_size": 57344
+    },
+    "model.layers.9.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        14336,
+        4096
+      ],
+      "size_in_bytes": 58720256,
+      "op_tensor_size": 58720256,
+      "offset": 1615290368,
+      "file_name": ".cache\\MatMulNBits_2_0_255.const",
+      "file_size": 58720256
+    },
+    "model.layers.9.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 1674010624,
+      "file_name": ".cache\\MatMulNBits_2_0_256.const",
+      "file_size": 16384
+    },
+    "model.layers.9.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 1674027008,
+      "file_name": ".cache\\MatMulNBits_2_0_257.const",
+      "file_size": 1835008
+    },
+    "model.layers.9.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 458752,
+      "op_tensor_size": 458752,
+      "offset": 1675862016,
+      "file_name": ".cache\\MatMulNBits_2_0_258.const",
+      "file_size": 458752
+    },
+    "model.layers.10.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1676320768,
+      "file_name": ".cache\\MatMulNBits_2_0_259.const",
+      "file_size": 8192
+    },
+    "model.layers.10.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        5120
+      ],
+      "size_in_bytes": 20971520,
+      "op_tensor_size": 20971520,
+      "offset": 1676328960,
+      "file_name": ".cache\\MatMulNBits_2_0_260.const",
+      "file_size": 20971520
+    },
+    "model.layers.10.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        5120
+      ],
+      "size_in_bytes": 20480,
+      "op_tensor_size": 20480,
+      "offset": 1697300480,
+      "file_name": ".cache\\MatMulNBits_2_0_261.const",
+      "file_size": 20480
+    },
+    "model.layers.10.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 655360,
+      "op_tensor_size": 655360,
+      "offset": 1697320960,
+      "file_name": ".cache\\MatMulNBits_2_0_262.const",
+      "file_size": 655360
+    },
+    "model.layers.10.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 163840,
+      "op_tensor_size": 163840,
+      "offset": 1697976320,
+      "file_name": ".cache\\MatMulNBits_2_0_263.const",
+      "file_size": 163840
+    },
+    "model.layers.10.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        1024
+      ],
+      "size_in_bytes": 4194304,
+      "op_tensor_size": 4194304,
+      "offset": 1698140160,
+      "file_name": ".cache\\MatMulNBits_2_0_264.const",
+      "file_size": 4194304
+    },
+    "model.layers.10.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        1024
+      ],
+      "size_in_bytes": 4096,
+      "op_tensor_size": 4096,
+      "offset": 1702334464,
+      "file_name": ".cache\\MatMulNBits_2_0_265.const",
+      "file_size": 4096
+    },
+    "model.layers.10.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 1702338560,
+      "file_name": ".cache\\MatMulNBits_2_0_266.const",
+      "file_size": 131072
+    },
+    "model.layers.10.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 1702469632,
+      "file_name": ".cache\\MatMulNBits_2_0_267.const",
+      "file_size": 32768
+    },
+    "model.layers.10.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        4096
+      ],
+      "size_in_bytes": 16777216,
+      "op_tensor_size": 16777216,
+      "offset": 1702502400,
+      "file_name": ".cache\\MatMulNBits_2_0_268.const",
+      "file_size": 16777216
+    },
+    "model.layers.10.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 1719279616,
+      "file_name": ".cache\\MatMulNBits_2_0_269.const",
+      "file_size": 16384
+    },
+    "model.layers.10.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 524288,
+      "op_tensor_size": 524288,
+      "offset": 1719296000,
+      "file_name": ".cache\\MatMulNBits_2_0_270.const",
+      "file_size": 524288
+    },
+    "model.layers.10.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 1719820288,
+      "file_name": ".cache\\MatMulNBits_2_0_271.const",
+      "file_size": 131072
+    },
+    "model.layers.10.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1719951360,
+      "file_name": ".cache\\MatMulNBits_2_0_272.const",
+      "file_size": 8192
+    },
+    "model.layers.10.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 1719959552,
+      "file_name": ".cache\\MatMulNBits_2_0_273.const",
+      "file_size": 29360128
+    },
+    "model.layers.10.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 1749319680,
+      "file_name": ".cache\\MatMulNBits_2_0_274.const",
+      "file_size": 1835008
+    },
+    "model.layers.10.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 1751154688,
+      "file_name": ".cache\\MatMulNBits_2_0_275.const",
+      "file_size": 229376
+    },
+    "model.layers.10.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 1751384064,
+      "file_name": ".cache\\MatMulNBits_2_0_276.const",
+      "file_size": 57344
+    },
+    "model.layers.10.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 1751441408,
+      "file_name": ".cache\\MatMulNBits_2_0_277.const",
+      "file_size": 29360128
+    },
+    "model.layers.10.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 1780801536,
+      "file_name": ".cache\\MatMulNBits_2_0_278.const",
+      "file_size": 1835008
+    },
+    "model.layers.10.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 1782636544,
+      "file_name": ".cache\\MatMulNBits_2_0_279.const",
+      "file_size": 229376
+    },
+    "model.layers.10.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 1782865920,
+      "file_name": ".cache\\MatMulNBits_2_0_280.const",
+      "file_size": 57344
+    },
+    "model.layers.10.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        14336,
+        4096
+      ],
+      "size_in_bytes": 58720256,
+      "op_tensor_size": 58720256,
+      "offset": 1782923264,
+      "file_name": ".cache\\MatMulNBits_2_0_281.const",
+      "file_size": 58720256
+    },
+    "model.layers.10.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 1841643520,
+      "file_name": ".cache\\MatMulNBits_2_0_282.const",
+      "file_size": 16384
+    },
+    "model.layers.10.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 1841659904,
+      "file_name": ".cache\\MatMulNBits_2_0_283.const",
+      "file_size": 1835008
+    },
+    "model.layers.10.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 458752,
+      "op_tensor_size": 458752,
+      "offset": 1843494912,
+      "file_name": ".cache\\MatMulNBits_2_0_284.const",
+      "file_size": 458752
+    },
+    "model.layers.11.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1843953664,
+      "file_name": ".cache\\MatMulNBits_2_0_285.const",
+      "file_size": 8192
+    },
+    "model.layers.11.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        5120
+      ],
+      "size_in_bytes": 20971520,
+      "op_tensor_size": 20971520,
+      "offset": 1843961856,
+      "file_name": ".cache\\MatMulNBits_2_0_286.const",
+      "file_size": 20971520
+    },
+    "model.layers.11.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        5120
+      ],
+      "size_in_bytes": 20480,
+      "op_tensor_size": 20480,
+      "offset": 1864933376,
+      "file_name": ".cache\\MatMulNBits_2_0_287.const",
+      "file_size": 20480
+    },
+    "model.layers.11.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 655360,
+      "op_tensor_size": 655360,
+      "offset": 1864953856,
+      "file_name": ".cache\\MatMulNBits_2_0_288.const",
+      "file_size": 655360
+    },
+    "model.layers.11.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 163840,
+      "op_tensor_size": 163840,
+      "offset": 1865609216,
+      "file_name": ".cache\\MatMulNBits_2_0_289.const",
+      "file_size": 163840
+    },
+    "model.layers.11.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        1024
+      ],
+      "size_in_bytes": 4194304,
+      "op_tensor_size": 4194304,
+      "offset": 1865773056,
+      "file_name": ".cache\\MatMulNBits_2_0_290.const",
+      "file_size": 4194304
+    },
+    "model.layers.11.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        1024
+      ],
+      "size_in_bytes": 4096,
+      "op_tensor_size": 4096,
+      "offset": 1869967360,
+      "file_name": ".cache\\MatMulNBits_2_0_291.const",
+      "file_size": 4096
+    },
+    "model.layers.11.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 1869971456,
+      "file_name": ".cache\\MatMulNBits_2_0_292.const",
+      "file_size": 131072
+    },
+    "model.layers.11.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 1870102528,
+      "file_name": ".cache\\MatMulNBits_2_0_293.const",
+      "file_size": 32768
+    },
+    "model.layers.11.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        4096
+      ],
+      "size_in_bytes": 16777216,
+      "op_tensor_size": 16777216,
+      "offset": 1870135296,
+      "file_name": ".cache\\MatMulNBits_2_0_294.const",
+      "file_size": 16777216
+    },
+    "model.layers.11.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 1886912512,
+      "file_name": ".cache\\MatMulNBits_2_0_295.const",
+      "file_size": 16384
+    },
+    "model.layers.11.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 524288,
+      "op_tensor_size": 524288,
+      "offset": 1886928896,
+      "file_name": ".cache\\MatMulNBits_2_0_296.const",
+      "file_size": 524288
+    },
+    "model.layers.11.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 1887453184,
+      "file_name": ".cache\\MatMulNBits_2_0_297.const",
+      "file_size": 131072
+    },
+    "model.layers.11.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 1887584256,
+      "file_name": ".cache\\MatMulNBits_2_0_298.const",
+      "file_size": 8192
+    },
+    "model.layers.11.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 1887592448,
+      "file_name": ".cache\\MatMulNBits_2_0_299.const",
+      "file_size": 29360128
+    },
+    "model.layers.11.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 1916952576,
+      "file_name": ".cache\\MatMulNBits_2_0_300.const",
+      "file_size": 1835008
+    },
+    "model.layers.11.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 1918787584,
+      "file_name": ".cache\\MatMulNBits_2_0_301.const",
+      "file_size": 229376
+    },
+    "model.layers.11.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 1919016960,
+      "file_name": ".cache\\MatMulNBits_2_0_302.const",
+      "file_size": 57344
+    },
+    "model.layers.11.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 1919074304,
+      "file_name": ".cache\\MatMulNBits_2_0_303.const",
+      "file_size": 29360128
+    },
+    "model.layers.11.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 1948434432,
+      "file_name": ".cache\\MatMulNBits_2_0_304.const",
+      "file_size": 1835008
+    },
+    "model.layers.11.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 1950269440,
+      "file_name": ".cache\\MatMulNBits_2_0_305.const",
+      "file_size": 229376
+    },
+    "model.layers.11.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 1950498816,
+      "file_name": ".cache\\MatMulNBits_2_0_306.const",
+      "file_size": 57344
+    },
+    "model.layers.11.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        14336,
+        4096
+      ],
+      "size_in_bytes": 58720256,
+      "op_tensor_size": 58720256,
+      "offset": 1950556160,
+      "file_name": ".cache\\MatMulNBits_2_0_307.const",
+      "file_size": 58720256
+    },
+    "model.layers.11.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 2009276416,
+      "file_name": ".cache\\MatMulNBits_2_0_308.const",
+      "file_size": 16384
+    },
+    "model.layers.11.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 2009292800,
+      "file_name": ".cache\\MatMulNBits_2_0_309.const",
+      "file_size": 1835008
+    },
+    "model.layers.11.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 458752,
+      "op_tensor_size": 458752,
+      "offset": 2011127808,
+      "file_name": ".cache\\MatMulNBits_2_0_310.const",
+      "file_size": 458752
+    },
+    "model.layers.12.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2011586560,
+      "file_name": ".cache\\MatMulNBits_2_0_311.const",
+      "file_size": 8192
+    },
+    "model.layers.12.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        5120
+      ],
+      "size_in_bytes": 20971520,
+      "op_tensor_size": 20971520,
+      "offset": 2011594752,
+      "file_name": ".cache\\MatMulNBits_2_0_312.const",
+      "file_size": 20971520
+    },
+    "model.layers.12.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        5120
+      ],
+      "size_in_bytes": 20480,
+      "op_tensor_size": 20480,
+      "offset": 2032566272,
+      "file_name": ".cache\\MatMulNBits_2_0_313.const",
+      "file_size": 20480
+    },
+    "model.layers.12.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 655360,
+      "op_tensor_size": 655360,
+      "offset": 2032586752,
+      "file_name": ".cache\\MatMulNBits_2_0_314.const",
+      "file_size": 655360
+    },
+    "model.layers.12.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 163840,
+      "op_tensor_size": 163840,
+      "offset": 2033242112,
+      "file_name": ".cache\\MatMulNBits_2_0_315.const",
+      "file_size": 163840
+    },
+    "model.layers.12.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        1024
+      ],
+      "size_in_bytes": 4194304,
+      "op_tensor_size": 4194304,
+      "offset": 2033405952,
+      "file_name": ".cache\\MatMulNBits_2_0_316.const",
+      "file_size": 4194304
+    },
+    "model.layers.12.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        1024
+      ],
+      "size_in_bytes": 4096,
+      "op_tensor_size": 4096,
+      "offset": 2037600256,
+      "file_name": ".cache\\MatMulNBits_2_0_317.const",
+      "file_size": 4096
+    },
+    "model.layers.12.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 2037604352,
+      "file_name": ".cache\\MatMulNBits_2_0_318.const",
+      "file_size": 131072
+    },
+    "model.layers.12.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 2037735424,
+      "file_name": ".cache\\MatMulNBits_2_0_319.const",
+      "file_size": 32768
+    },
+    "model.layers.12.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        4096
+      ],
+      "size_in_bytes": 16777216,
+      "op_tensor_size": 16777216,
+      "offset": 2037768192,
+      "file_name": ".cache\\MatMulNBits_2_0_320.const",
+      "file_size": 16777216
+    },
+    "model.layers.12.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 2054545408,
+      "file_name": ".cache\\MatMulNBits_2_0_321.const",
+      "file_size": 16384
+    },
+    "model.layers.12.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 524288,
+      "op_tensor_size": 524288,
+      "offset": 2054561792,
+      "file_name": ".cache\\MatMulNBits_2_0_322.const",
+      "file_size": 524288
+    },
+    "model.layers.12.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 2055086080,
+      "file_name": ".cache\\MatMulNBits_2_0_323.const",
+      "file_size": 131072
+    },
+    "model.layers.12.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2055217152,
+      "file_name": ".cache\\MatMulNBits_2_0_324.const",
+      "file_size": 8192
+    },
+    "model.layers.12.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 2055225344,
+      "file_name": ".cache\\MatMulNBits_2_0_325.const",
+      "file_size": 29360128
+    },
+    "model.layers.12.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 2084585472,
+      "file_name": ".cache\\MatMulNBits_2_0_326.const",
+      "file_size": 1835008
+    },
+    "model.layers.12.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 2086420480,
+      "file_name": ".cache\\MatMulNBits_2_0_327.const",
+      "file_size": 229376
+    },
+    "model.layers.12.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 2086649856,
+      "file_name": ".cache\\MatMulNBits_2_0_328.const",
+      "file_size": 57344
+    },
+    "model.layers.12.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 2086707200,
+      "file_name": ".cache\\MatMulNBits_2_0_329.const",
+      "file_size": 29360128
+    },
+    "model.layers.12.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 2116067328,
+      "file_name": ".cache\\MatMulNBits_2_0_330.const",
+      "file_size": 1835008
+    },
+    "model.layers.12.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 2117902336,
+      "file_name": ".cache\\MatMulNBits_2_0_331.const",
+      "file_size": 229376
+    },
+    "model.layers.12.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 2118131712,
+      "file_name": ".cache\\MatMulNBits_2_0_332.const",
+      "file_size": 57344
+    },
+    "model.layers.12.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        14336,
+        4096
+      ],
+      "size_in_bytes": 58720256,
+      "op_tensor_size": 58720256,
+      "offset": 2118189056,
+      "file_name": ".cache\\MatMulNBits_2_0_333.const",
+      "file_size": 58720256
+    },
+    "model.layers.12.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 2176909312,
+      "file_name": ".cache\\MatMulNBits_2_0_334.const",
+      "file_size": 16384
+    },
+    "model.layers.12.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 2176925696,
+      "file_name": ".cache\\MatMulNBits_2_0_335.const",
+      "file_size": 1835008
+    },
+    "model.layers.12.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 458752,
+      "op_tensor_size": 458752,
+      "offset": 2178760704,
+      "file_name": ".cache\\MatMulNBits_2_0_336.const",
+      "file_size": 458752
+    },
+    "model.layers.13.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2179219456,
+      "file_name": ".cache\\MatMulNBits_2_0_337.const",
+      "file_size": 8192
+    },
+    "model.layers.13.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        5120
+      ],
+      "size_in_bytes": 20971520,
+      "op_tensor_size": 20971520,
+      "offset": 2179227648,
+      "file_name": ".cache\\MatMulNBits_2_0_338.const",
+      "file_size": 20971520
+    },
+    "model.layers.13.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        5120
+      ],
+      "size_in_bytes": 20480,
+      "op_tensor_size": 20480,
+      "offset": 2200199168,
+      "file_name": ".cache\\MatMulNBits_2_0_339.const",
+      "file_size": 20480
+    },
+    "model.layers.13.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 655360,
+      "op_tensor_size": 655360,
+      "offset": 2200219648,
+      "file_name": ".cache\\MatMulNBits_2_0_340.const",
+      "file_size": 655360
+    },
+    "model.layers.13.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 163840,
+      "op_tensor_size": 163840,
+      "offset": 2200875008,
+      "file_name": ".cache\\MatMulNBits_2_0_341.const",
+      "file_size": 163840
+    },
+    "model.layers.13.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        1024
+      ],
+      "size_in_bytes": 4194304,
+      "op_tensor_size": 4194304,
+      "offset": 2201038848,
+      "file_name": ".cache\\MatMulNBits_2_0_342.const",
+      "file_size": 4194304
+    },
+    "model.layers.13.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        1024
+      ],
+      "size_in_bytes": 4096,
+      "op_tensor_size": 4096,
+      "offset": 2205233152,
+      "file_name": ".cache\\MatMulNBits_2_0_343.const",
+      "file_size": 4096
+    },
+    "model.layers.13.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 2205237248,
+      "file_name": ".cache\\MatMulNBits_2_0_344.const",
+      "file_size": 131072
+    },
+    "model.layers.13.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 2205368320,
+      "file_name": ".cache\\MatMulNBits_2_0_345.const",
+      "file_size": 32768
+    },
+    "model.layers.13.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        4096
+      ],
+      "size_in_bytes": 16777216,
+      "op_tensor_size": 16777216,
+      "offset": 2205401088,
+      "file_name": ".cache\\MatMulNBits_2_0_346.const",
+      "file_size": 16777216
+    },
+    "model.layers.13.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 2222178304,
+      "file_name": ".cache\\MatMulNBits_2_0_347.const",
+      "file_size": 16384
+    },
+    "model.layers.13.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 524288,
+      "op_tensor_size": 524288,
+      "offset": 2222194688,
+      "file_name": ".cache\\MatMulNBits_2_0_348.const",
+      "file_size": 524288
+    },
+    "model.layers.13.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 2222718976,
+      "file_name": ".cache\\MatMulNBits_2_0_349.const",
+      "file_size": 131072
+    },
+    "model.layers.13.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2222850048,
+      "file_name": ".cache\\MatMulNBits_2_0_350.const",
+      "file_size": 8192
+    },
+    "model.layers.13.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 2222858240,
+      "file_name": ".cache\\MatMulNBits_2_0_351.const",
+      "file_size": 29360128
+    },
+    "model.layers.13.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 2252218368,
+      "file_name": ".cache\\MatMulNBits_2_0_352.const",
+      "file_size": 1835008
+    },
+    "model.layers.13.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 2254053376,
+      "file_name": ".cache\\MatMulNBits_2_0_353.const",
+      "file_size": 229376
+    },
+    "model.layers.13.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 2254282752,
+      "file_name": ".cache\\MatMulNBits_2_0_354.const",
+      "file_size": 57344
+    },
+    "model.layers.13.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 2254340096,
+      "file_name": ".cache\\MatMulNBits_2_0_355.const",
+      "file_size": 29360128
+    },
+    "model.layers.13.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 2283700224,
+      "file_name": ".cache\\MatMulNBits_2_0_356.const",
+      "file_size": 1835008
+    },
+    "model.layers.13.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 2285535232,
+      "file_name": ".cache\\MatMulNBits_2_0_357.const",
+      "file_size": 229376
+    },
+    "model.layers.13.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 2285764608,
+      "file_name": ".cache\\MatMulNBits_2_0_358.const",
+      "file_size": 57344
+    },
+    "model.layers.13.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        14336,
+        4096
+      ],
+      "size_in_bytes": 58720256,
+      "op_tensor_size": 58720256,
+      "offset": 2285821952,
+      "file_name": ".cache\\MatMulNBits_2_0_359.const",
+      "file_size": 58720256
+    },
+    "model.layers.13.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 2344542208,
+      "file_name": ".cache\\MatMulNBits_2_0_360.const",
+      "file_size": 16384
+    },
+    "model.layers.13.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 2344558592,
+      "file_name": ".cache\\MatMulNBits_2_0_361.const",
+      "file_size": 1835008
+    },
+    "model.layers.13.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 458752,
+      "op_tensor_size": 458752,
+      "offset": 2346393600,
+      "file_name": ".cache\\MatMulNBits_2_0_362.const",
+      "file_size": 458752
+    },
+    "model.layers.14.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2346852352,
+      "file_name": ".cache\\MatMulNBits_2_0_363.const",
+      "file_size": 8192
+    },
+    "model.layers.14.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        5120
+      ],
+      "size_in_bytes": 20971520,
+      "op_tensor_size": 20971520,
+      "offset": 2346860544,
+      "file_name": ".cache\\MatMulNBits_2_0_364.const",
+      "file_size": 20971520
+    },
+    "model.layers.14.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        5120
+      ],
+      "size_in_bytes": 20480,
+      "op_tensor_size": 20480,
+      "offset": 2367832064,
+      "file_name": ".cache\\MatMulNBits_2_0_365.const",
+      "file_size": 20480
+    },
+    "model.layers.14.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 655360,
+      "op_tensor_size": 655360,
+      "offset": 2367852544,
+      "file_name": ".cache\\MatMulNBits_2_0_366.const",
+      "file_size": 655360
+    },
+    "model.layers.14.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 163840,
+      "op_tensor_size": 163840,
+      "offset": 2368507904,
+      "file_name": ".cache\\MatMulNBits_2_0_367.const",
+      "file_size": 163840
+    },
+    "model.layers.14.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        1024
+      ],
+      "size_in_bytes": 4194304,
+      "op_tensor_size": 4194304,
+      "offset": 2368671744,
+      "file_name": ".cache\\MatMulNBits_2_0_368.const",
+      "file_size": 4194304
+    },
+    "model.layers.14.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        1024
+      ],
+      "size_in_bytes": 4096,
+      "op_tensor_size": 4096,
+      "offset": 2372866048,
+      "file_name": ".cache\\MatMulNBits_2_0_369.const",
+      "file_size": 4096
+    },
+    "model.layers.14.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 2372870144,
+      "file_name": ".cache\\MatMulNBits_2_0_370.const",
+      "file_size": 131072
+    },
+    "model.layers.14.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 2373001216,
+      "file_name": ".cache\\MatMulNBits_2_0_371.const",
+      "file_size": 32768
+    },
+    "model.layers.14.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        4096
+      ],
+      "size_in_bytes": 16777216,
+      "op_tensor_size": 16777216,
+      "offset": 2373033984,
+      "file_name": ".cache\\MatMulNBits_2_0_372.const",
+      "file_size": 16777216
+    },
+    "model.layers.14.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 2389811200,
+      "file_name": ".cache\\MatMulNBits_2_0_373.const",
+      "file_size": 16384
+    },
+    "model.layers.14.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 524288,
+      "op_tensor_size": 524288,
+      "offset": 2389827584,
+      "file_name": ".cache\\MatMulNBits_2_0_374.const",
+      "file_size": 524288
+    },
+    "model.layers.14.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 2390351872,
+      "file_name": ".cache\\MatMulNBits_2_0_375.const",
+      "file_size": 131072
+    },
+    "model.layers.14.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2390482944,
+      "file_name": ".cache\\MatMulNBits_2_0_376.const",
+      "file_size": 8192
+    },
+    "model.layers.14.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 2390491136,
+      "file_name": ".cache\\MatMulNBits_2_0_377.const",
+      "file_size": 29360128
+    },
+    "model.layers.14.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 2419851264,
+      "file_name": ".cache\\MatMulNBits_2_0_378.const",
+      "file_size": 1835008
+    },
+    "model.layers.14.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 2421686272,
+      "file_name": ".cache\\MatMulNBits_2_0_379.const",
+      "file_size": 229376
+    },
+    "model.layers.14.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 2421915648,
+      "file_name": ".cache\\MatMulNBits_2_0_380.const",
+      "file_size": 57344
+    },
+    "model.layers.14.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 2421972992,
+      "file_name": ".cache\\MatMulNBits_2_0_381.const",
+      "file_size": 29360128
+    },
+    "model.layers.14.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 2451333120,
+      "file_name": ".cache\\MatMulNBits_2_0_382.const",
+      "file_size": 1835008
+    },
+    "model.layers.14.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 2453168128,
+      "file_name": ".cache\\MatMulNBits_2_0_383.const",
+      "file_size": 229376
+    },
+    "model.layers.14.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 2453397504,
+      "file_name": ".cache\\MatMulNBits_2_0_384.const",
+      "file_size": 57344
+    },
+    "model.layers.14.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        14336,
+        4096
+      ],
+      "size_in_bytes": 58720256,
+      "op_tensor_size": 58720256,
+      "offset": 2453454848,
+      "file_name": ".cache\\MatMulNBits_2_0_385.const",
+      "file_size": 58720256
+    },
+    "model.layers.14.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 2512175104,
+      "file_name": ".cache\\MatMulNBits_2_0_386.const",
+      "file_size": 16384
+    },
+    "model.layers.14.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 2512191488,
+      "file_name": ".cache\\MatMulNBits_2_0_387.const",
+      "file_size": 1835008
+    },
+    "model.layers.14.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 458752,
+      "op_tensor_size": 458752,
+      "offset": 2514026496,
+      "file_name": ".cache\\MatMulNBits_2_0_388.const",
+      "file_size": 458752
+    },
+    "model.layers.15.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2514485248,
+      "file_name": ".cache\\MatMulNBits_2_0_389.const",
+      "file_size": 8192
+    },
+    "model.layers.15.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        5120
+      ],
+      "size_in_bytes": 20971520,
+      "op_tensor_size": 20971520,
+      "offset": 2514493440,
+      "file_name": ".cache\\MatMulNBits_2_0_390.const",
+      "file_size": 20971520
+    },
+    "model.layers.15.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        5120
+      ],
+      "size_in_bytes": 20480,
+      "op_tensor_size": 20480,
+      "offset": 2535464960,
+      "file_name": ".cache\\MatMulNBits_2_0_391.const",
+      "file_size": 20480
+    },
+    "model.layers.15.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 655360,
+      "op_tensor_size": 655360,
+      "offset": 2535485440,
+      "file_name": ".cache\\MatMulNBits_2_0_392.const",
+      "file_size": 655360
+    },
+    "model.layers.15.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 163840,
+      "op_tensor_size": 163840,
+      "offset": 2536140800,
+      "file_name": ".cache\\MatMulNBits_2_0_393.const",
+      "file_size": 163840
+    },
+    "model.layers.15.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        1024
+      ],
+      "size_in_bytes": 4194304,
+      "op_tensor_size": 4194304,
+      "offset": 2536304640,
+      "file_name": ".cache\\MatMulNBits_2_0_394.const",
+      "file_size": 4194304
+    },
+    "model.layers.15.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        1024
+      ],
+      "size_in_bytes": 4096,
+      "op_tensor_size": 4096,
+      "offset": 2540498944,
+      "file_name": ".cache\\MatMulNBits_2_0_395.const",
+      "file_size": 4096
+    },
+    "model.layers.15.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 2540503040,
+      "file_name": ".cache\\MatMulNBits_2_0_396.const",
+      "file_size": 131072
+    },
+    "model.layers.15.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 2540634112,
+      "file_name": ".cache\\MatMulNBits_2_0_397.const",
+      "file_size": 32768
+    },
+    "model.layers.15.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        4096
+      ],
+      "size_in_bytes": 16777216,
+      "op_tensor_size": 16777216,
+      "offset": 2540666880,
+      "file_name": ".cache\\MatMulNBits_2_0_398.const",
+      "file_size": 16777216
+    },
+    "model.layers.15.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 2557444096,
+      "file_name": ".cache\\MatMulNBits_2_0_399.const",
+      "file_size": 16384
+    },
+    "model.layers.15.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 524288,
+      "op_tensor_size": 524288,
+      "offset": 2557460480,
+      "file_name": ".cache\\MatMulNBits_2_0_400.const",
+      "file_size": 524288
+    },
+    "model.layers.15.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 2557984768,
+      "file_name": ".cache\\MatMulNBits_2_0_401.const",
+      "file_size": 131072
+    },
+    "model.layers.15.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2558115840,
+      "file_name": ".cache\\MatMulNBits_2_0_402.const",
+      "file_size": 8192
+    },
+    "model.layers.15.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 2558124032,
+      "file_name": ".cache\\MatMulNBits_2_0_403.const",
+      "file_size": 29360128
+    },
+    "model.layers.15.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 2587484160,
+      "file_name": ".cache\\MatMulNBits_2_0_404.const",
+      "file_size": 1835008
+    },
+    "model.layers.15.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 2589319168,
+      "file_name": ".cache\\MatMulNBits_2_0_405.const",
+      "file_size": 229376
+    },
+    "model.layers.15.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 2589548544,
+      "file_name": ".cache\\MatMulNBits_2_0_406.const",
+      "file_size": 57344
+    },
+    "model.layers.15.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 2589605888,
+      "file_name": ".cache\\MatMulNBits_2_0_407.const",
+      "file_size": 29360128
+    },
+    "model.layers.15.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 2618966016,
+      "file_name": ".cache\\MatMulNBits_2_0_408.const",
+      "file_size": 1835008
+    },
+    "model.layers.15.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 2620801024,
+      "file_name": ".cache\\MatMulNBits_2_0_409.const",
+      "file_size": 229376
+    },
+    "model.layers.15.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 2621030400,
+      "file_name": ".cache\\MatMulNBits_2_0_410.const",
+      "file_size": 57344
+    },
+    "model.layers.15.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        14336,
+        4096
+      ],
+      "size_in_bytes": 58720256,
+      "op_tensor_size": 58720256,
+      "offset": 2621087744,
+      "file_name": ".cache\\MatMulNBits_2_0_411.const",
+      "file_size": 58720256
+    },
+    "model.layers.15.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 2679808000,
+      "file_name": ".cache\\MatMulNBits_2_0_412.const",
+      "file_size": 16384
+    },
+    "model.layers.15.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 2679824384,
+      "file_name": ".cache\\MatMulNBits_2_0_413.const",
+      "file_size": 1835008
+    },
+    "model.layers.15.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 458752,
+      "op_tensor_size": 458752,
+      "offset": 2681659392,
+      "file_name": ".cache\\MatMulNBits_2_0_414.const",
+      "file_size": 458752
+    },
+    "model.layers.16.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2682118144,
+      "file_name": ".cache\\MatMulNBits_2_0_415.const",
+      "file_size": 8192
+    },
+    "model.layers.16.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        5120
+      ],
+      "size_in_bytes": 20971520,
+      "op_tensor_size": 20971520,
+      "offset": 2682126336,
+      "file_name": ".cache\\MatMulNBits_2_0_416.const",
+      "file_size": 20971520
+    },
+    "model.layers.16.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        5120
+      ],
+      "size_in_bytes": 20480,
+      "op_tensor_size": 20480,
+      "offset": 2703097856,
+      "file_name": ".cache\\MatMulNBits_2_0_417.const",
+      "file_size": 20480
+    },
+    "model.layers.16.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 655360,
+      "op_tensor_size": 655360,
+      "offset": 2703118336,
+      "file_name": ".cache\\MatMulNBits_2_0_418.const",
+      "file_size": 655360
+    },
+    "model.layers.16.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 163840,
+      "op_tensor_size": 163840,
+      "offset": 2703773696,
+      "file_name": ".cache\\MatMulNBits_2_0_419.const",
+      "file_size": 163840
+    },
+    "model.layers.16.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        1024
+      ],
+      "size_in_bytes": 4194304,
+      "op_tensor_size": 4194304,
+      "offset": 2703937536,
+      "file_name": ".cache\\MatMulNBits_2_0_420.const",
+      "file_size": 4194304
+    },
+    "model.layers.16.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        1024
+      ],
+      "size_in_bytes": 4096,
+      "op_tensor_size": 4096,
+      "offset": 2708131840,
+      "file_name": ".cache\\MatMulNBits_2_0_421.const",
+      "file_size": 4096
+    },
+    "model.layers.16.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 2708135936,
+      "file_name": ".cache\\MatMulNBits_2_0_422.const",
+      "file_size": 131072
+    },
+    "model.layers.16.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 2708267008,
+      "file_name": ".cache\\MatMulNBits_2_0_423.const",
+      "file_size": 32768
+    },
+    "model.layers.16.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        4096
+      ],
+      "size_in_bytes": 16777216,
+      "op_tensor_size": 16777216,
+      "offset": 2708299776,
+      "file_name": ".cache\\MatMulNBits_2_0_424.const",
+      "file_size": 16777216
+    },
+    "model.layers.16.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 2725076992,
+      "file_name": ".cache\\MatMulNBits_2_0_425.const",
+      "file_size": 16384
+    },
+    "model.layers.16.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 524288,
+      "op_tensor_size": 524288,
+      "offset": 2725093376,
+      "file_name": ".cache\\MatMulNBits_2_0_426.const",
+      "file_size": 524288
+    },
+    "model.layers.16.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 2725617664,
+      "file_name": ".cache\\MatMulNBits_2_0_427.const",
+      "file_size": 131072
+    },
+    "model.layers.16.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2725748736,
+      "file_name": ".cache\\MatMulNBits_2_0_428.const",
+      "file_size": 8192
+    },
+    "model.layers.16.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 2725756928,
+      "file_name": ".cache\\MatMulNBits_2_0_429.const",
+      "file_size": 29360128
+    },
+    "model.layers.16.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 2755117056,
+      "file_name": ".cache\\MatMulNBits_2_0_430.const",
+      "file_size": 1835008
+    },
+    "model.layers.16.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 2756952064,
+      "file_name": ".cache\\MatMulNBits_2_0_431.const",
+      "file_size": 229376
+    },
+    "model.layers.16.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 2757181440,
+      "file_name": ".cache\\MatMulNBits_2_0_432.const",
+      "file_size": 57344
+    },
+    "model.layers.16.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 2757238784,
+      "file_name": ".cache\\MatMulNBits_2_0_433.const",
+      "file_size": 29360128
+    },
+    "model.layers.16.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 2786598912,
+      "file_name": ".cache\\MatMulNBits_2_0_434.const",
+      "file_size": 1835008
+    },
+    "model.layers.16.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 2788433920,
+      "file_name": ".cache\\MatMulNBits_2_0_435.const",
+      "file_size": 229376
+    },
+    "model.layers.16.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 2788663296,
+      "file_name": ".cache\\MatMulNBits_2_0_436.const",
+      "file_size": 57344
+    },
+    "model.layers.16.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        14336,
+        4096
+      ],
+      "size_in_bytes": 58720256,
+      "op_tensor_size": 58720256,
+      "offset": 2788720640,
+      "file_name": ".cache\\MatMulNBits_2_0_437.const",
+      "file_size": 58720256
+    },
+    "model.layers.16.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 2847440896,
+      "file_name": ".cache\\MatMulNBits_2_0_438.const",
+      "file_size": 16384
+    },
+    "model.layers.16.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 2847457280,
+      "file_name": ".cache\\MatMulNBits_2_0_439.const",
+      "file_size": 1835008
+    },
+    "model.layers.16.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 458752,
+      "op_tensor_size": 458752,
+      "offset": 2849292288,
+      "file_name": ".cache\\MatMulNBits_2_0_440.const",
+      "file_size": 458752
+    },
+    "model.layers.17.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2849751040,
+      "file_name": ".cache\\MatMulNBits_2_0_441.const",
+      "file_size": 8192
+    },
+    "model.layers.17.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        5120
+      ],
+      "size_in_bytes": 20971520,
+      "op_tensor_size": 20971520,
+      "offset": 2849759232,
+      "file_name": ".cache\\MatMulNBits_2_0_442.const",
+      "file_size": 20971520
+    },
+    "model.layers.17.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        5120
+      ],
+      "size_in_bytes": 20480,
+      "op_tensor_size": 20480,
+      "offset": 2870730752,
+      "file_name": ".cache\\MatMulNBits_2_0_443.const",
+      "file_size": 20480
+    },
+    "model.layers.17.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 655360,
+      "op_tensor_size": 655360,
+      "offset": 2870751232,
+      "file_name": ".cache\\MatMulNBits_2_0_444.const",
+      "file_size": 655360
+    },
+    "model.layers.17.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 163840,
+      "op_tensor_size": 163840,
+      "offset": 2871406592,
+      "file_name": ".cache\\MatMulNBits_2_0_445.const",
+      "file_size": 163840
+    },
+    "model.layers.17.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        1024
+      ],
+      "size_in_bytes": 4194304,
+      "op_tensor_size": 4194304,
+      "offset": 2871570432,
+      "file_name": ".cache\\MatMulNBits_2_0_446.const",
+      "file_size": 4194304
+    },
+    "model.layers.17.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        1024
+      ],
+      "size_in_bytes": 4096,
+      "op_tensor_size": 4096,
+      "offset": 2875764736,
+      "file_name": ".cache\\MatMulNBits_2_0_447.const",
+      "file_size": 4096
+    },
+    "model.layers.17.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 2875768832,
+      "file_name": ".cache\\MatMulNBits_2_0_448.const",
+      "file_size": 131072
+    },
+    "model.layers.17.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 2875899904,
+      "file_name": ".cache\\MatMulNBits_2_0_449.const",
+      "file_size": 32768
+    },
+    "model.layers.17.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        4096
+      ],
+      "size_in_bytes": 16777216,
+      "op_tensor_size": 16777216,
+      "offset": 2875932672,
+      "file_name": ".cache\\MatMulNBits_2_0_450.const",
+      "file_size": 16777216
+    },
+    "model.layers.17.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 2892709888,
+      "file_name": ".cache\\MatMulNBits_2_0_451.const",
+      "file_size": 16384
+    },
+    "model.layers.17.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 524288,
+      "op_tensor_size": 524288,
+      "offset": 2892726272,
+      "file_name": ".cache\\MatMulNBits_2_0_452.const",
+      "file_size": 524288
+    },
+    "model.layers.17.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 2893250560,
+      "file_name": ".cache\\MatMulNBits_2_0_453.const",
+      "file_size": 131072
+    },
+    "model.layers.17.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 2893381632,
+      "file_name": ".cache\\MatMulNBits_2_0_454.const",
+      "file_size": 8192
+    },
+    "model.layers.17.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 2893389824,
+      "file_name": ".cache\\MatMulNBits_2_0_455.const",
+      "file_size": 29360128
+    },
+    "model.layers.17.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 2922749952,
+      "file_name": ".cache\\MatMulNBits_2_0_456.const",
+      "file_size": 1835008
+    },
+    "model.layers.17.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 2924584960,
+      "file_name": ".cache\\MatMulNBits_2_0_457.const",
+      "file_size": 229376
+    },
+    "model.layers.17.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 2924814336,
+      "file_name": ".cache\\MatMulNBits_2_0_458.const",
+      "file_size": 57344
+    },
+    "model.layers.17.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 2924871680,
+      "file_name": ".cache\\MatMulNBits_2_0_459.const",
+      "file_size": 29360128
+    },
+    "model.layers.17.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 2954231808,
+      "file_name": ".cache\\MatMulNBits_2_0_460.const",
+      "file_size": 1835008
+    },
+    "model.layers.17.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 2956066816,
+      "file_name": ".cache\\MatMulNBits_2_0_461.const",
+      "file_size": 229376
+    },
+    "model.layers.17.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 2956296192,
+      "file_name": ".cache\\MatMulNBits_2_0_462.const",
+      "file_size": 57344
+    },
+    "model.layers.17.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        14336,
+        4096
+      ],
+      "size_in_bytes": 58720256,
+      "op_tensor_size": 58720256,
+      "offset": 2956353536,
+      "file_name": ".cache\\MatMulNBits_2_0_463.const",
+      "file_size": 58720256
+    },
+    "model.layers.17.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 3015073792,
+      "file_name": ".cache\\MatMulNBits_2_0_464.const",
+      "file_size": 16384
+    },
+    "model.layers.17.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 3015090176,
+      "file_name": ".cache\\MatMulNBits_2_0_465.const",
+      "file_size": 1835008
+    },
+    "model.layers.17.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 458752,
+      "op_tensor_size": 458752,
+      "offset": 3016925184,
+      "file_name": ".cache\\MatMulNBits_2_0_466.const",
+      "file_size": 458752
+    },
+    "model.layers.18.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 3017383936,
+      "file_name": ".cache\\MatMulNBits_2_0_467.const",
+      "file_size": 8192
+    },
+    "model.layers.18.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        5120
+      ],
+      "size_in_bytes": 20971520,
+      "op_tensor_size": 20971520,
+      "offset": 3017392128,
+      "file_name": ".cache\\MatMulNBits_2_0_468.const",
+      "file_size": 20971520
+    },
+    "model.layers.18.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        5120
+      ],
+      "size_in_bytes": 20480,
+      "op_tensor_size": 20480,
+      "offset": 3038363648,
+      "file_name": ".cache\\MatMulNBits_2_0_469.const",
+      "file_size": 20480
+    },
+    "model.layers.18.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 655360,
+      "op_tensor_size": 655360,
+      "offset": 3038384128,
+      "file_name": ".cache\\MatMulNBits_2_0_470.const",
+      "file_size": 655360
+    },
+    "model.layers.18.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 163840,
+      "op_tensor_size": 163840,
+      "offset": 3039039488,
+      "file_name": ".cache\\MatMulNBits_2_0_471.const",
+      "file_size": 163840
+    },
+    "model.layers.18.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        1024
+      ],
+      "size_in_bytes": 4194304,
+      "op_tensor_size": 4194304,
+      "offset": 3039203328,
+      "file_name": ".cache\\MatMulNBits_2_0_472.const",
+      "file_size": 4194304
+    },
+    "model.layers.18.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        1024
+      ],
+      "size_in_bytes": 4096,
+      "op_tensor_size": 4096,
+      "offset": 3043397632,
+      "file_name": ".cache\\MatMulNBits_2_0_473.const",
+      "file_size": 4096
+    },
+    "model.layers.18.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 3043401728,
+      "file_name": ".cache\\MatMulNBits_2_0_474.const",
+      "file_size": 131072
+    },
+    "model.layers.18.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 3043532800,
+      "file_name": ".cache\\MatMulNBits_2_0_475.const",
+      "file_size": 32768
+    },
+    "model.layers.18.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        4096
+      ],
+      "size_in_bytes": 16777216,
+      "op_tensor_size": 16777216,
+      "offset": 3043565568,
+      "file_name": ".cache\\MatMulNBits_2_0_476.const",
+      "file_size": 16777216
+    },
+    "model.layers.18.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 3060342784,
+      "file_name": ".cache\\MatMulNBits_2_0_477.const",
+      "file_size": 16384
+    },
+    "model.layers.18.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 524288,
+      "op_tensor_size": 524288,
+      "offset": 3060359168,
+      "file_name": ".cache\\MatMulNBits_2_0_478.const",
+      "file_size": 524288
+    },
+    "model.layers.18.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 3060883456,
+      "file_name": ".cache\\MatMulNBits_2_0_479.const",
+      "file_size": 131072
+    },
+    "model.layers.18.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 3061014528,
+      "file_name": ".cache\\MatMulNBits_2_0_480.const",
+      "file_size": 8192
+    },
+    "model.layers.18.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 3061022720,
+      "file_name": ".cache\\MatMulNBits_2_0_481.const",
+      "file_size": 29360128
+    },
+    "model.layers.18.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 3090382848,
+      "file_name": ".cache\\MatMulNBits_2_0_482.const",
+      "file_size": 1835008
+    },
+    "model.layers.18.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 3092217856,
+      "file_name": ".cache\\MatMulNBits_2_0_483.const",
+      "file_size": 229376
+    },
+    "model.layers.18.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 3092447232,
+      "file_name": ".cache\\MatMulNBits_2_0_484.const",
+      "file_size": 57344
+    },
+    "model.layers.18.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 3092504576,
+      "file_name": ".cache\\MatMulNBits_2_0_485.const",
+      "file_size": 29360128
+    },
+    "model.layers.18.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 3121864704,
+      "file_name": ".cache\\MatMulNBits_2_0_486.const",
+      "file_size": 1835008
+    },
+    "model.layers.18.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 3123699712,
+      "file_name": ".cache\\MatMulNBits_2_0_487.const",
+      "file_size": 229376
+    },
+    "model.layers.18.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 3123929088,
+      "file_name": ".cache\\MatMulNBits_2_0_488.const",
+      "file_size": 57344
+    },
+    "model.layers.18.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        14336,
+        4096
+      ],
+      "size_in_bytes": 58720256,
+      "op_tensor_size": 58720256,
+      "offset": 3123986432,
+      "file_name": ".cache\\MatMulNBits_2_0_489.const",
+      "file_size": 58720256
+    },
+    "model.layers.18.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 3182706688,
+      "file_name": ".cache\\MatMulNBits_2_0_490.const",
+      "file_size": 16384
+    },
+    "model.layers.18.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 3182723072,
+      "file_name": ".cache\\MatMulNBits_2_0_491.const",
+      "file_size": 1835008
+    },
+    "model.layers.18.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 458752,
+      "op_tensor_size": 458752,
+      "offset": 3184558080,
+      "file_name": ".cache\\MatMulNBits_2_0_492.const",
+      "file_size": 458752
+    },
+    "model.layers.19.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 3185016832,
+      "file_name": ".cache\\MatMulNBits_2_0_493.const",
+      "file_size": 8192
+    },
+    "model.layers.19.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        5120
+      ],
+      "size_in_bytes": 20971520,
+      "op_tensor_size": 20971520,
+      "offset": 3185025024,
+      "file_name": ".cache\\MatMulNBits_2_0_494.const",
+      "file_size": 20971520
+    },
+    "model.layers.19.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        5120
+      ],
+      "size_in_bytes": 20480,
+      "op_tensor_size": 20480,
+      "offset": 3205996544,
+      "file_name": ".cache\\MatMulNBits_2_0_495.const",
+      "file_size": 20480
+    },
+    "model.layers.19.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 655360,
+      "op_tensor_size": 655360,
+      "offset": 3206017024,
+      "file_name": ".cache\\MatMulNBits_2_0_496.const",
+      "file_size": 655360
+    },
+    "model.layers.19.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 163840,
+      "op_tensor_size": 163840,
+      "offset": 3206672384,
+      "file_name": ".cache\\MatMulNBits_2_0_497.const",
+      "file_size": 163840
+    },
+    "model.layers.19.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        1024
+      ],
+      "size_in_bytes": 4194304,
+      "op_tensor_size": 4194304,
+      "offset": 3206836224,
+      "file_name": ".cache\\MatMulNBits_2_0_498.const",
+      "file_size": 4194304
+    },
+    "model.layers.19.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        1024
+      ],
+      "size_in_bytes": 4096,
+      "op_tensor_size": 4096,
+      "offset": 3211030528,
+      "file_name": ".cache\\MatMulNBits_2_0_499.const",
+      "file_size": 4096
+    },
+    "model.layers.19.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 3211034624,
+      "file_name": ".cache\\MatMulNBits_2_0_500.const",
+      "file_size": 131072
+    },
+    "model.layers.19.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 3211165696,
+      "file_name": ".cache\\MatMulNBits_2_0_501.const",
+      "file_size": 32768
+    },
+    "model.layers.19.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        4096
+      ],
+      "size_in_bytes": 16777216,
+      "op_tensor_size": 16777216,
+      "offset": 3211198464,
+      "file_name": ".cache\\MatMulNBits_2_0_502.const",
+      "file_size": 16777216
+    },
+    "model.layers.19.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 3227975680,
+      "file_name": ".cache\\MatMulNBits_2_0_503.const",
+      "file_size": 16384
+    },
+    "model.layers.19.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 524288,
+      "op_tensor_size": 524288,
+      "offset": 3227992064,
+      "file_name": ".cache\\MatMulNBits_2_0_504.const",
+      "file_size": 524288
+    },
+    "model.layers.19.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 3228516352,
+      "file_name": ".cache\\MatMulNBits_2_0_505.const",
+      "file_size": 131072
+    },
+    "model.layers.19.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 3228647424,
+      "file_name": ".cache\\MatMulNBits_2_0_506.const",
+      "file_size": 8192
+    },
+    "model.layers.19.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 3228655616,
+      "file_name": ".cache\\MatMulNBits_2_0_507.const",
+      "file_size": 29360128
+    },
+    "model.layers.19.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 3258015744,
+      "file_name": ".cache\\MatMulNBits_2_0_508.const",
+      "file_size": 1835008
+    },
+    "model.layers.19.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 3259850752,
+      "file_name": ".cache\\MatMulNBits_2_0_509.const",
+      "file_size": 229376
+    },
+    "model.layers.19.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 3260080128,
+      "file_name": ".cache\\MatMulNBits_2_0_510.const",
+      "file_size": 57344
+    },
+    "model.layers.19.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 3260137472,
+      "file_name": ".cache\\MatMulNBits_2_0_511.const",
+      "file_size": 29360128
+    },
+    "model.layers.19.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 3289497600,
+      "file_name": ".cache\\MatMulNBits_2_0_512.const",
+      "file_size": 1835008
+    },
+    "model.layers.19.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 3291332608,
+      "file_name": ".cache\\MatMulNBits_2_0_513.const",
+      "file_size": 229376
+    },
+    "model.layers.19.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 3291561984,
+      "file_name": ".cache\\MatMulNBits_2_0_514.const",
+      "file_size": 57344
+    },
+    "model.layers.19.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        14336,
+        4096
+      ],
+      "size_in_bytes": 58720256,
+      "op_tensor_size": 58720256,
+      "offset": 3291619328,
+      "file_name": ".cache\\MatMulNBits_2_0_515.const",
+      "file_size": 58720256
+    },
+    "model.layers.19.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 3350339584,
+      "file_name": ".cache\\MatMulNBits_2_0_516.const",
+      "file_size": 16384
+    },
+    "model.layers.19.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 3350355968,
+      "file_name": ".cache\\MatMulNBits_2_0_517.const",
+      "file_size": 1835008
+    },
+    "model.layers.19.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 458752,
+      "op_tensor_size": 458752,
+      "offset": 3352190976,
+      "file_name": ".cache\\MatMulNBits_2_0_518.const",
+      "file_size": 458752
+    },
+    "model.layers.20.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 3352649728,
+      "file_name": ".cache\\MatMulNBits_2_0_519.const",
+      "file_size": 8192
+    },
+    "model.layers.20.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        5120
+      ],
+      "size_in_bytes": 20971520,
+      "op_tensor_size": 20971520,
+      "offset": 3352657920,
+      "file_name": ".cache\\MatMulNBits_2_0_520.const",
+      "file_size": 20971520
+    },
+    "model.layers.20.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        5120
+      ],
+      "size_in_bytes": 20480,
+      "op_tensor_size": 20480,
+      "offset": 3373629440,
+      "file_name": ".cache\\MatMulNBits_2_0_521.const",
+      "file_size": 20480
+    },
+    "model.layers.20.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 655360,
+      "op_tensor_size": 655360,
+      "offset": 3373649920,
+      "file_name": ".cache\\MatMulNBits_2_0_522.const",
+      "file_size": 655360
+    },
+    "model.layers.20.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 163840,
+      "op_tensor_size": 163840,
+      "offset": 3374305280,
+      "file_name": ".cache\\MatMulNBits_2_0_523.const",
+      "file_size": 163840
+    },
+    "model.layers.20.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        1024
+      ],
+      "size_in_bytes": 4194304,
+      "op_tensor_size": 4194304,
+      "offset": 3374469120,
+      "file_name": ".cache\\MatMulNBits_2_0_524.const",
+      "file_size": 4194304
+    },
+    "model.layers.20.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        1024
+      ],
+      "size_in_bytes": 4096,
+      "op_tensor_size": 4096,
+      "offset": 3378663424,
+      "file_name": ".cache\\MatMulNBits_2_0_525.const",
+      "file_size": 4096
+    },
+    "model.layers.20.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 3378667520,
+      "file_name": ".cache\\MatMulNBits_2_0_526.const",
+      "file_size": 131072
+    },
+    "model.layers.20.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 3378798592,
+      "file_name": ".cache\\MatMulNBits_2_0_527.const",
+      "file_size": 32768
+    },
+    "model.layers.20.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        4096
+      ],
+      "size_in_bytes": 16777216,
+      "op_tensor_size": 16777216,
+      "offset": 3378831360,
+      "file_name": ".cache\\MatMulNBits_2_0_528.const",
+      "file_size": 16777216
+    },
+    "model.layers.20.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 3395608576,
+      "file_name": ".cache\\MatMulNBits_2_0_529.const",
+      "file_size": 16384
+    },
+    "model.layers.20.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 524288,
+      "op_tensor_size": 524288,
+      "offset": 3395624960,
+      "file_name": ".cache\\MatMulNBits_2_0_530.const",
+      "file_size": 524288
+    },
+    "model.layers.20.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 3396149248,
+      "file_name": ".cache\\MatMulNBits_2_0_531.const",
+      "file_size": 131072
+    },
+    "model.layers.20.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 3396280320,
+      "file_name": ".cache\\MatMulNBits_2_0_532.const",
+      "file_size": 8192
+    },
+    "model.layers.20.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 3396288512,
+      "file_name": ".cache\\MatMulNBits_2_0_533.const",
+      "file_size": 29360128
+    },
+    "model.layers.20.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 3425648640,
+      "file_name": ".cache\\MatMulNBits_2_0_534.const",
+      "file_size": 1835008
+    },
+    "model.layers.20.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 3427483648,
+      "file_name": ".cache\\MatMulNBits_2_0_535.const",
+      "file_size": 229376
+    },
+    "model.layers.20.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 3427713024,
+      "file_name": ".cache\\MatMulNBits_2_0_536.const",
+      "file_size": 57344
+    },
+    "model.layers.20.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 3427770368,
+      "file_name": ".cache\\MatMulNBits_2_0_537.const",
+      "file_size": 29360128
+    },
+    "model.layers.20.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 3457130496,
+      "file_name": ".cache\\MatMulNBits_2_0_538.const",
+      "file_size": 1835008
+    },
+    "model.layers.20.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 3458965504,
+      "file_name": ".cache\\MatMulNBits_2_0_539.const",
+      "file_size": 229376
+    },
+    "model.layers.20.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 3459194880,
+      "file_name": ".cache\\MatMulNBits_2_0_540.const",
+      "file_size": 57344
+    },
+    "model.layers.20.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        14336,
+        4096
+      ],
+      "size_in_bytes": 58720256,
+      "op_tensor_size": 58720256,
+      "offset": 3459252224,
+      "file_name": ".cache\\MatMulNBits_2_0_541.const",
+      "file_size": 58720256
+    },
+    "model.layers.20.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 3517972480,
+      "file_name": ".cache\\MatMulNBits_2_0_542.const",
+      "file_size": 16384
+    },
+    "model.layers.20.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 3517988864,
+      "file_name": ".cache\\MatMulNBits_2_0_543.const",
+      "file_size": 1835008
+    },
+    "model.layers.20.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 458752,
+      "op_tensor_size": 458752,
+      "offset": 3519823872,
+      "file_name": ".cache\\MatMulNBits_2_0_544.const",
+      "file_size": 458752
+    },
+    "model.layers.21.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 3520282624,
+      "file_name": ".cache\\MatMulNBits_2_0_545.const",
+      "file_size": 8192
+    },
+    "model.layers.21.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        5120
+      ],
+      "size_in_bytes": 20971520,
+      "op_tensor_size": 20971520,
+      "offset": 3520290816,
+      "file_name": ".cache\\MatMulNBits_2_0_546.const",
+      "file_size": 20971520
+    },
+    "model.layers.21.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        5120
+      ],
+      "size_in_bytes": 20480,
+      "op_tensor_size": 20480,
+      "offset": 3541262336,
+      "file_name": ".cache\\MatMulNBits_2_0_547.const",
+      "file_size": 20480
+    },
+    "model.layers.21.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 655360,
+      "op_tensor_size": 655360,
+      "offset": 3541282816,
+      "file_name": ".cache\\MatMulNBits_2_0_548.const",
+      "file_size": 655360
+    },
+    "model.layers.21.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 163840,
+      "op_tensor_size": 163840,
+      "offset": 3541938176,
+      "file_name": ".cache\\MatMulNBits_2_0_549.const",
+      "file_size": 163840
+    },
+    "model.layers.21.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        1024
+      ],
+      "size_in_bytes": 4194304,
+      "op_tensor_size": 4194304,
+      "offset": 3542102016,
+      "file_name": ".cache\\MatMulNBits_2_0_550.const",
+      "file_size": 4194304
+    },
+    "model.layers.21.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        1024
+      ],
+      "size_in_bytes": 4096,
+      "op_tensor_size": 4096,
+      "offset": 3546296320,
+      "file_name": ".cache\\MatMulNBits_2_0_551.const",
+      "file_size": 4096
+    },
+    "model.layers.21.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 3546300416,
+      "file_name": ".cache\\MatMulNBits_2_0_552.const",
+      "file_size": 131072
+    },
+    "model.layers.21.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 3546431488,
+      "file_name": ".cache\\MatMulNBits_2_0_553.const",
+      "file_size": 32768
+    },
+    "model.layers.21.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        4096
+      ],
+      "size_in_bytes": 16777216,
+      "op_tensor_size": 16777216,
+      "offset": 3546464256,
+      "file_name": ".cache\\MatMulNBits_2_0_554.const",
+      "file_size": 16777216
+    },
+    "model.layers.21.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 3563241472,
+      "file_name": ".cache\\MatMulNBits_2_0_555.const",
+      "file_size": 16384
+    },
+    "model.layers.21.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 524288,
+      "op_tensor_size": 524288,
+      "offset": 3563257856,
+      "file_name": ".cache\\MatMulNBits_2_0_556.const",
+      "file_size": 524288
+    },
+    "model.layers.21.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 3563782144,
+      "file_name": ".cache\\MatMulNBits_2_0_557.const",
+      "file_size": 131072
+    },
+    "model.layers.21.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 3563913216,
+      "file_name": ".cache\\MatMulNBits_2_0_558.const",
+      "file_size": 8192
+    },
+    "model.layers.21.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 3563921408,
+      "file_name": ".cache\\MatMulNBits_2_0_559.const",
+      "file_size": 29360128
+    },
+    "model.layers.21.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 3593281536,
+      "file_name": ".cache\\MatMulNBits_2_0_560.const",
+      "file_size": 1835008
+    },
+    "model.layers.21.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 3595116544,
+      "file_name": ".cache\\MatMulNBits_2_0_561.const",
+      "file_size": 229376
+    },
+    "model.layers.21.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 3595345920,
+      "file_name": ".cache\\MatMulNBits_2_0_562.const",
+      "file_size": 57344
+    },
+    "model.layers.21.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 3595403264,
+      "file_name": ".cache\\MatMulNBits_2_0_563.const",
+      "file_size": 29360128
+    },
+    "model.layers.21.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 3624763392,
+      "file_name": ".cache\\MatMulNBits_2_0_564.const",
+      "file_size": 1835008
+    },
+    "model.layers.21.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 3626598400,
+      "file_name": ".cache\\MatMulNBits_2_0_565.const",
+      "file_size": 229376
+    },
+    "model.layers.21.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 3626827776,
+      "file_name": ".cache\\MatMulNBits_2_0_566.const",
+      "file_size": 57344
+    },
+    "model.layers.21.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        14336,
+        4096
+      ],
+      "size_in_bytes": 58720256,
+      "op_tensor_size": 58720256,
+      "offset": 3626885120,
+      "file_name": ".cache\\MatMulNBits_2_0_567.const",
+      "file_size": 58720256
+    },
+    "model.layers.21.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 3685605376,
+      "file_name": ".cache\\MatMulNBits_2_0_568.const",
+      "file_size": 16384
+    },
+    "model.layers.21.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 3685621760,
+      "file_name": ".cache\\MatMulNBits_2_0_569.const",
+      "file_size": 1835008
+    },
+    "model.layers.21.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 458752,
+      "op_tensor_size": 458752,
+      "offset": 3687456768,
+      "file_name": ".cache\\MatMulNBits_2_0_570.const",
+      "file_size": 458752
+    },
+    "model.layers.22.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 3687915520,
+      "file_name": ".cache\\MatMulNBits_2_0_571.const",
+      "file_size": 8192
+    },
+    "model.layers.22.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        5120
+      ],
+      "size_in_bytes": 20971520,
+      "op_tensor_size": 20971520,
+      "offset": 3687923712,
+      "file_name": ".cache\\MatMulNBits_2_0_572.const",
+      "file_size": 20971520
+    },
+    "model.layers.22.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        5120
+      ],
+      "size_in_bytes": 20480,
+      "op_tensor_size": 20480,
+      "offset": 3708895232,
+      "file_name": ".cache\\MatMulNBits_2_0_573.const",
+      "file_size": 20480
+    },
+    "model.layers.22.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 655360,
+      "op_tensor_size": 655360,
+      "offset": 3708915712,
+      "file_name": ".cache\\MatMulNBits_2_0_574.const",
+      "file_size": 655360
+    },
+    "model.layers.22.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 163840,
+      "op_tensor_size": 163840,
+      "offset": 3709571072,
+      "file_name": ".cache\\MatMulNBits_2_0_575.const",
+      "file_size": 163840
+    },
+    "model.layers.22.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        1024
+      ],
+      "size_in_bytes": 4194304,
+      "op_tensor_size": 4194304,
+      "offset": 3709734912,
+      "file_name": ".cache\\MatMulNBits_2_0_576.const",
+      "file_size": 4194304
+    },
+    "model.layers.22.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        1024
+      ],
+      "size_in_bytes": 4096,
+      "op_tensor_size": 4096,
+      "offset": 3713929216,
+      "file_name": ".cache\\MatMulNBits_2_0_577.const",
+      "file_size": 4096
+    },
+    "model.layers.22.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 3713933312,
+      "file_name": ".cache\\MatMulNBits_2_0_578.const",
+      "file_size": 131072
+    },
+    "model.layers.22.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 3714064384,
+      "file_name": ".cache\\MatMulNBits_2_0_579.const",
+      "file_size": 32768
+    },
+    "model.layers.22.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        4096
+      ],
+      "size_in_bytes": 16777216,
+      "op_tensor_size": 16777216,
+      "offset": 3714097152,
+      "file_name": ".cache\\MatMulNBits_2_0_580.const",
+      "file_size": 16777216
+    },
+    "model.layers.22.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 3730874368,
+      "file_name": ".cache\\MatMulNBits_2_0_581.const",
+      "file_size": 16384
+    },
+    "model.layers.22.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 524288,
+      "op_tensor_size": 524288,
+      "offset": 3730890752,
+      "file_name": ".cache\\MatMulNBits_2_0_582.const",
+      "file_size": 524288
+    },
+    "model.layers.22.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 3731415040,
+      "file_name": ".cache\\MatMulNBits_2_0_583.const",
+      "file_size": 131072
+    },
+    "model.layers.22.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 3731546112,
+      "file_name": ".cache\\MatMulNBits_2_0_584.const",
+      "file_size": 8192
+    },
+    "model.layers.22.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 3731554304,
+      "file_name": ".cache\\MatMulNBits_2_0_585.const",
+      "file_size": 29360128
+    },
+    "model.layers.22.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 3760914432,
+      "file_name": ".cache\\MatMulNBits_2_0_586.const",
+      "file_size": 1835008
+    },
+    "model.layers.22.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 3762749440,
+      "file_name": ".cache\\MatMulNBits_2_0_587.const",
+      "file_size": 229376
+    },
+    "model.layers.22.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 3762978816,
+      "file_name": ".cache\\MatMulNBits_2_0_588.const",
+      "file_size": 57344
+    },
+    "model.layers.22.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 3763036160,
+      "file_name": ".cache\\MatMulNBits_2_0_589.const",
+      "file_size": 29360128
+    },
+    "model.layers.22.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 3792396288,
+      "file_name": ".cache\\MatMulNBits_2_0_590.const",
+      "file_size": 1835008
+    },
+    "model.layers.22.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 3794231296,
+      "file_name": ".cache\\MatMulNBits_2_0_591.const",
+      "file_size": 229376
+    },
+    "model.layers.22.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 3794460672,
+      "file_name": ".cache\\MatMulNBits_2_0_592.const",
+      "file_size": 57344
+    },
+    "model.layers.22.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        14336,
+        4096
+      ],
+      "size_in_bytes": 58720256,
+      "op_tensor_size": 58720256,
+      "offset": 3794518016,
+      "file_name": ".cache\\MatMulNBits_2_0_593.const",
+      "file_size": 58720256
+    },
+    "model.layers.22.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 3853238272,
+      "file_name": ".cache\\MatMulNBits_2_0_594.const",
+      "file_size": 16384
+    },
+    "model.layers.22.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 3853254656,
+      "file_name": ".cache\\MatMulNBits_2_0_595.const",
+      "file_size": 1835008
+    },
+    "model.layers.22.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 458752,
+      "op_tensor_size": 458752,
+      "offset": 3855089664,
+      "file_name": ".cache\\MatMulNBits_2_0_596.const",
+      "file_size": 458752
+    },
+    "model.layers.23.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 3855548416,
+      "file_name": ".cache\\MatMulNBits_2_0_597.const",
+      "file_size": 8192
+    },
+    "model.layers.23.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        5120
+      ],
+      "size_in_bytes": 20971520,
+      "op_tensor_size": 20971520,
+      "offset": 3855556608,
+      "file_name": ".cache\\MatMulNBits_2_0_598.const",
+      "file_size": 20971520
+    },
+    "model.layers.23.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        5120
+      ],
+      "size_in_bytes": 20480,
+      "op_tensor_size": 20480,
+      "offset": 3876528128,
+      "file_name": ".cache\\MatMulNBits_2_0_599.const",
+      "file_size": 20480
+    },
+    "model.layers.23.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 655360,
+      "op_tensor_size": 655360,
+      "offset": 3876548608,
+      "file_name": ".cache\\MatMulNBits_2_0_600.const",
+      "file_size": 655360
+    },
+    "model.layers.23.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 163840,
+      "op_tensor_size": 163840,
+      "offset": 3877203968,
+      "file_name": ".cache\\MatMulNBits_2_0_601.const",
+      "file_size": 163840
+    },
+    "model.layers.23.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        1024
+      ],
+      "size_in_bytes": 4194304,
+      "op_tensor_size": 4194304,
+      "offset": 3877367808,
+      "file_name": ".cache\\MatMulNBits_2_0_602.const",
+      "file_size": 4194304
+    },
+    "model.layers.23.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        1024
+      ],
+      "size_in_bytes": 4096,
+      "op_tensor_size": 4096,
+      "offset": 3881562112,
+      "file_name": ".cache\\MatMulNBits_2_0_603.const",
+      "file_size": 4096
+    },
+    "model.layers.23.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 3881566208,
+      "file_name": ".cache\\MatMulNBits_2_0_604.const",
+      "file_size": 131072
+    },
+    "model.layers.23.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 3881697280,
+      "file_name": ".cache\\MatMulNBits_2_0_605.const",
+      "file_size": 32768
+    },
+    "model.layers.23.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        4096
+      ],
+      "size_in_bytes": 16777216,
+      "op_tensor_size": 16777216,
+      "offset": 3881730048,
+      "file_name": ".cache\\MatMulNBits_2_0_606.const",
+      "file_size": 16777216
+    },
+    "model.layers.23.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 3898507264,
+      "file_name": ".cache\\MatMulNBits_2_0_607.const",
+      "file_size": 16384
+    },
+    "model.layers.23.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 524288,
+      "op_tensor_size": 524288,
+      "offset": 3898523648,
+      "file_name": ".cache\\MatMulNBits_2_0_608.const",
+      "file_size": 524288
+    },
+    "model.layers.23.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 3899047936,
+      "file_name": ".cache\\MatMulNBits_2_0_609.const",
+      "file_size": 131072
+    },
+    "model.layers.23.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 3899179008,
+      "file_name": ".cache\\MatMulNBits_2_0_610.const",
+      "file_size": 8192
+    },
+    "model.layers.23.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 3899187200,
+      "file_name": ".cache\\MatMulNBits_2_0_611.const",
+      "file_size": 29360128
+    },
+    "model.layers.23.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 3928547328,
+      "file_name": ".cache\\MatMulNBits_2_0_612.const",
+      "file_size": 1835008
+    },
+    "model.layers.23.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 3930382336,
+      "file_name": ".cache\\MatMulNBits_2_0_613.const",
+      "file_size": 229376
+    },
+    "model.layers.23.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 3930611712,
+      "file_name": ".cache\\MatMulNBits_2_0_614.const",
+      "file_size": 57344
+    },
+    "model.layers.23.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 3930669056,
+      "file_name": ".cache\\MatMulNBits_2_0_615.const",
+      "file_size": 29360128
+    },
+    "model.layers.23.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 3960029184,
+      "file_name": ".cache\\MatMulNBits_2_0_616.const",
+      "file_size": 1835008
+    },
+    "model.layers.23.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 3961864192,
+      "file_name": ".cache\\MatMulNBits_2_0_617.const",
+      "file_size": 229376
+    },
+    "model.layers.23.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 3962093568,
+      "file_name": ".cache\\MatMulNBits_2_0_618.const",
+      "file_size": 57344
+    },
+    "model.layers.23.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        14336,
+        4096
+      ],
+      "size_in_bytes": 58720256,
+      "op_tensor_size": 58720256,
+      "offset": 3962150912,
+      "file_name": ".cache\\MatMulNBits_2_0_619.const",
+      "file_size": 58720256
+    },
+    "model.layers.23.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 4020871168,
+      "file_name": ".cache\\MatMulNBits_2_0_620.const",
+      "file_size": 16384
+    },
+    "model.layers.23.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 4020887552,
+      "file_name": ".cache\\MatMulNBits_2_0_621.const",
+      "file_size": 1835008
+    },
+    "model.layers.23.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 458752,
+      "op_tensor_size": 458752,
+      "offset": 4022722560,
+      "file_name": ".cache\\MatMulNBits_2_0_622.const",
+      "file_size": 458752
+    },
+    "model.layers.24.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 4023181312,
+      "file_name": ".cache\\MatMulNBits_2_0_623.const",
+      "file_size": 8192
+    },
+    "model.layers.24.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        5120
+      ],
+      "size_in_bytes": 20971520,
+      "op_tensor_size": 20971520,
+      "offset": 4023189504,
+      "file_name": ".cache\\MatMulNBits_2_0_624.const",
+      "file_size": 20971520
+    },
+    "model.layers.24.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        5120
+      ],
+      "size_in_bytes": 20480,
+      "op_tensor_size": 20480,
+      "offset": 4044161024,
+      "file_name": ".cache\\MatMulNBits_2_0_625.const",
+      "file_size": 20480
+    },
+    "model.layers.24.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 655360,
+      "op_tensor_size": 655360,
+      "offset": 4044181504,
+      "file_name": ".cache\\MatMulNBits_2_0_626.const",
+      "file_size": 655360
+    },
+    "model.layers.24.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 163840,
+      "op_tensor_size": 163840,
+      "offset": 4044836864,
+      "file_name": ".cache\\MatMulNBits_2_0_627.const",
+      "file_size": 163840
+    },
+    "model.layers.24.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        1024
+      ],
+      "size_in_bytes": 4194304,
+      "op_tensor_size": 4194304,
+      "offset": 4045000704,
+      "file_name": ".cache\\MatMulNBits_2_0_628.const",
+      "file_size": 4194304
+    },
+    "model.layers.24.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        1024
+      ],
+      "size_in_bytes": 4096,
+      "op_tensor_size": 4096,
+      "offset": 4049195008,
+      "file_name": ".cache\\MatMulNBits_2_0_629.const",
+      "file_size": 4096
+    },
+    "model.layers.24.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 4049199104,
+      "file_name": ".cache\\MatMulNBits_2_0_630.const",
+      "file_size": 131072
+    },
+    "model.layers.24.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 4049330176,
+      "file_name": ".cache\\MatMulNBits_2_0_631.const",
+      "file_size": 32768
+    },
+    "model.layers.24.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        4096
+      ],
+      "size_in_bytes": 16777216,
+      "op_tensor_size": 16777216,
+      "offset": 4049362944,
+      "file_name": ".cache\\MatMulNBits_2_0_632.const",
+      "file_size": 16777216
+    },
+    "model.layers.24.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 4066140160,
+      "file_name": ".cache\\MatMulNBits_2_0_633.const",
+      "file_size": 16384
+    },
+    "model.layers.24.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 524288,
+      "op_tensor_size": 524288,
+      "offset": 4066156544,
+      "file_name": ".cache\\MatMulNBits_2_0_634.const",
+      "file_size": 524288
+    },
+    "model.layers.24.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 4066680832,
+      "file_name": ".cache\\MatMulNBits_2_0_635.const",
+      "file_size": 131072
+    },
+    "model.layers.24.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 4066811904,
+      "file_name": ".cache\\MatMulNBits_2_0_636.const",
+      "file_size": 8192
+    },
+    "model.layers.24.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 4066820096,
+      "file_name": ".cache\\MatMulNBits_2_0_637.const",
+      "file_size": 29360128
+    },
+    "model.layers.24.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 4096180224,
+      "file_name": ".cache\\MatMulNBits_2_0_638.const",
+      "file_size": 1835008
+    },
+    "model.layers.24.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 4098015232,
+      "file_name": ".cache\\MatMulNBits_2_0_639.const",
+      "file_size": 229376
+    },
+    "model.layers.24.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 4098244608,
+      "file_name": ".cache\\MatMulNBits_2_0_640.const",
+      "file_size": 57344
+    },
+    "model.layers.24.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 4098301952,
+      "file_name": ".cache\\MatMulNBits_2_0_641.const",
+      "file_size": 29360128
+    },
+    "model.layers.24.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 4127662080,
+      "file_name": ".cache\\MatMulNBits_2_0_642.const",
+      "file_size": 1835008
+    },
+    "model.layers.24.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 4129497088,
+      "file_name": ".cache\\MatMulNBits_2_0_643.const",
+      "file_size": 229376
+    },
+    "model.layers.24.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 4129726464,
+      "file_name": ".cache\\MatMulNBits_2_0_644.const",
+      "file_size": 57344
+    },
+    "model.layers.24.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        14336,
+        4096
+      ],
+      "size_in_bytes": 58720256,
+      "op_tensor_size": 58720256,
+      "offset": 4129783808,
+      "file_name": ".cache\\MatMulNBits_2_0_645.const",
+      "file_size": 58720256
+    },
+    "model.layers.24.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 4188504064,
+      "file_name": ".cache\\MatMulNBits_2_0_646.const",
+      "file_size": 16384
+    },
+    "model.layers.24.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 4188520448,
+      "file_name": ".cache\\MatMulNBits_2_0_647.const",
+      "file_size": 1835008
+    },
+    "model.layers.24.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 458752,
+      "op_tensor_size": 458752,
+      "offset": 4190355456,
+      "file_name": ".cache\\MatMulNBits_2_0_648.const",
+      "file_size": 458752
+    },
+    "model.layers.25.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 4190814208,
+      "file_name": ".cache\\MatMulNBits_2_0_649.const",
+      "file_size": 8192
+    },
+    "model.layers.25.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        5120
+      ],
+      "size_in_bytes": 20971520,
+      "op_tensor_size": 20971520,
+      "offset": 4190822400,
+      "file_name": ".cache\\MatMulNBits_2_0_650.const",
+      "file_size": 20971520
+    },
+    "model.layers.25.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        5120
+      ],
+      "size_in_bytes": 20480,
+      "op_tensor_size": 20480,
+      "offset": 4211793920,
+      "file_name": ".cache\\MatMulNBits_2_0_651.const",
+      "file_size": 20480
+    },
+    "model.layers.25.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 655360,
+      "op_tensor_size": 655360,
+      "offset": 4211814400,
+      "file_name": ".cache\\MatMulNBits_2_0_652.const",
+      "file_size": 655360
+    },
+    "model.layers.25.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 163840,
+      "op_tensor_size": 163840,
+      "offset": 4212469760,
+      "file_name": ".cache\\MatMulNBits_2_0_653.const",
+      "file_size": 163840
+    },
+    "model.layers.25.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        1024
+      ],
+      "size_in_bytes": 4194304,
+      "op_tensor_size": 4194304,
+      "offset": 4212633600,
+      "file_name": ".cache\\MatMulNBits_2_0_654.const",
+      "file_size": 4194304
+    },
+    "model.layers.25.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        1024
+      ],
+      "size_in_bytes": 4096,
+      "op_tensor_size": 4096,
+      "offset": 4216827904,
+      "file_name": ".cache\\MatMulNBits_2_0_655.const",
+      "file_size": 4096
+    },
+    "model.layers.25.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 4216832000,
+      "file_name": ".cache\\MatMulNBits_2_0_656.const",
+      "file_size": 131072
+    },
+    "model.layers.25.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 4216963072,
+      "file_name": ".cache\\MatMulNBits_2_0_657.const",
+      "file_size": 32768
+    },
+    "model.layers.25.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        4096
+      ],
+      "size_in_bytes": 16777216,
+      "op_tensor_size": 16777216,
+      "offset": 4216995840,
+      "file_name": ".cache\\MatMulNBits_2_0_658.const",
+      "file_size": 16777216
+    },
+    "model.layers.25.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 4233773056,
+      "file_name": ".cache\\MatMulNBits_2_0_659.const",
+      "file_size": 16384
+    },
+    "model.layers.25.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 524288,
+      "op_tensor_size": 524288,
+      "offset": 4233789440,
+      "file_name": ".cache\\MatMulNBits_2_0_660.const",
+      "file_size": 524288
+    },
+    "model.layers.25.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 4234313728,
+      "file_name": ".cache\\MatMulNBits_2_0_661.const",
+      "file_size": 131072
+    },
+    "model.layers.25.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 4234444800,
+      "file_name": ".cache\\MatMulNBits_2_0_662.const",
+      "file_size": 8192
+    },
+    "model.layers.25.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 4234452992,
+      "file_name": ".cache\\MatMulNBits_2_0_663.const",
+      "file_size": 29360128
+    },
+    "model.layers.25.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 4263813120,
+      "file_name": ".cache\\MatMulNBits_2_0_664.const",
+      "file_size": 1835008
+    },
+    "model.layers.25.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 4265648128,
+      "file_name": ".cache\\MatMulNBits_2_0_665.const",
+      "file_size": 229376
+    },
+    "model.layers.25.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 4265877504,
+      "file_name": ".cache\\MatMulNBits_2_0_666.const",
+      "file_size": 57344
+    },
+    "model.layers.25.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 4265934848,
+      "file_name": ".cache\\MatMulNBits_2_0_667.const",
+      "file_size": 29360128
+    },
+    "model.layers.25.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 4295294976,
+      "file_name": ".cache\\MatMulNBits_2_0_668.const",
+      "file_size": 1835008
+    },
+    "model.layers.25.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 4297129984,
+      "file_name": ".cache\\MatMulNBits_2_0_669.const",
+      "file_size": 229376
+    },
+    "model.layers.25.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 4297359360,
+      "file_name": ".cache\\MatMulNBits_2_0_670.const",
+      "file_size": 57344
+    },
+    "model.layers.25.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        14336,
+        4096
+      ],
+      "size_in_bytes": 58720256,
+      "op_tensor_size": 58720256,
+      "offset": 4297416704,
+      "file_name": ".cache\\MatMulNBits_2_0_671.const",
+      "file_size": 58720256
+    },
+    "model.layers.25.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 4356136960,
+      "file_name": ".cache\\MatMulNBits_2_0_672.const",
+      "file_size": 16384
+    },
+    "model.layers.25.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 4356153344,
+      "file_name": ".cache\\MatMulNBits_2_0_673.const",
+      "file_size": 1835008
+    },
+    "model.layers.25.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 458752,
+      "op_tensor_size": 458752,
+      "offset": 4357988352,
+      "file_name": ".cache\\MatMulNBits_2_0_674.const",
+      "file_size": 458752
+    },
+    "model.layers.26.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 4358447104,
+      "file_name": ".cache\\MatMulNBits_2_0_675.const",
+      "file_size": 8192
+    },
+    "model.layers.26.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        5120
+      ],
+      "size_in_bytes": 20971520,
+      "op_tensor_size": 20971520,
+      "offset": 4358455296,
+      "file_name": ".cache\\MatMulNBits_2_0_676.const",
+      "file_size": 20971520
+    },
+    "model.layers.26.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        5120
+      ],
+      "size_in_bytes": 20480,
+      "op_tensor_size": 20480,
+      "offset": 4379426816,
+      "file_name": ".cache\\MatMulNBits_2_0_677.const",
+      "file_size": 20480
+    },
+    "model.layers.26.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 655360,
+      "op_tensor_size": 655360,
+      "offset": 4379447296,
+      "file_name": ".cache\\MatMulNBits_2_0_678.const",
+      "file_size": 655360
+    },
+    "model.layers.26.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 163840,
+      "op_tensor_size": 163840,
+      "offset": 4380102656,
+      "file_name": ".cache\\MatMulNBits_2_0_679.const",
+      "file_size": 163840
+    },
+    "model.layers.26.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        1024
+      ],
+      "size_in_bytes": 4194304,
+      "op_tensor_size": 4194304,
+      "offset": 4380266496,
+      "file_name": ".cache\\MatMulNBits_2_0_680.const",
+      "file_size": 4194304
+    },
+    "model.layers.26.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        1024
+      ],
+      "size_in_bytes": 4096,
+      "op_tensor_size": 4096,
+      "offset": 4384460800,
+      "file_name": ".cache\\MatMulNBits_2_0_681.const",
+      "file_size": 4096
+    },
+    "model.layers.26.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 4384464896,
+      "file_name": ".cache\\MatMulNBits_2_0_682.const",
+      "file_size": 131072
+    },
+    "model.layers.26.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 4384595968,
+      "file_name": ".cache\\MatMulNBits_2_0_683.const",
+      "file_size": 32768
+    },
+    "model.layers.26.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        4096
+      ],
+      "size_in_bytes": 16777216,
+      "op_tensor_size": 16777216,
+      "offset": 4384628736,
+      "file_name": ".cache\\MatMulNBits_2_0_684.const",
+      "file_size": 16777216
+    },
+    "model.layers.26.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 4401405952,
+      "file_name": ".cache\\MatMulNBits_2_0_685.const",
+      "file_size": 16384
+    },
+    "model.layers.26.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 524288,
+      "op_tensor_size": 524288,
+      "offset": 4401422336,
+      "file_name": ".cache\\MatMulNBits_2_0_686.const",
+      "file_size": 524288
+    },
+    "model.layers.26.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 4401946624,
+      "file_name": ".cache\\MatMulNBits_2_0_687.const",
+      "file_size": 131072
+    },
+    "model.layers.26.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 4402077696,
+      "file_name": ".cache\\MatMulNBits_2_0_688.const",
+      "file_size": 8192
+    },
+    "model.layers.26.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 4402085888,
+      "file_name": ".cache\\MatMulNBits_2_0_689.const",
+      "file_size": 29360128
+    },
+    "model.layers.26.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 4431446016,
+      "file_name": ".cache\\MatMulNBits_2_0_690.const",
+      "file_size": 1835008
+    },
+    "model.layers.26.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 4433281024,
+      "file_name": ".cache\\MatMulNBits_2_0_691.const",
+      "file_size": 229376
+    },
+    "model.layers.26.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 4433510400,
+      "file_name": ".cache\\MatMulNBits_2_0_692.const",
+      "file_size": 57344
+    },
+    "model.layers.26.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 4433567744,
+      "file_name": ".cache\\MatMulNBits_2_0_693.const",
+      "file_size": 29360128
+    },
+    "model.layers.26.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 4462927872,
+      "file_name": ".cache\\MatMulNBits_2_0_694.const",
+      "file_size": 1835008
+    },
+    "model.layers.26.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 4464762880,
+      "file_name": ".cache\\MatMulNBits_2_0_695.const",
+      "file_size": 229376
+    },
+    "model.layers.26.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 4464992256,
+      "file_name": ".cache\\MatMulNBits_2_0_696.const",
+      "file_size": 57344
+    },
+    "model.layers.26.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        14336,
+        4096
+      ],
+      "size_in_bytes": 58720256,
+      "op_tensor_size": 58720256,
+      "offset": 4465049600,
+      "file_name": ".cache\\MatMulNBits_2_0_697.const",
+      "file_size": 58720256
+    },
+    "model.layers.26.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 4523769856,
+      "file_name": ".cache\\MatMulNBits_2_0_698.const",
+      "file_size": 16384
+    },
+    "model.layers.26.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 4523786240,
+      "file_name": ".cache\\MatMulNBits_2_0_699.const",
+      "file_size": 1835008
+    },
+    "model.layers.26.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 458752,
+      "op_tensor_size": 458752,
+      "offset": 4525621248,
+      "file_name": ".cache\\MatMulNBits_2_0_700.const",
+      "file_size": 458752
+    },
+    "model.layers.27.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 4526080000,
+      "file_name": ".cache\\MatMulNBits_2_0_701.const",
+      "file_size": 8192
+    },
+    "model.layers.27.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        5120
+      ],
+      "size_in_bytes": 20971520,
+      "op_tensor_size": 20971520,
+      "offset": 4526088192,
+      "file_name": ".cache\\MatMulNBits_2_0_702.const",
+      "file_size": 20971520
+    },
+    "model.layers.27.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        5120
+      ],
+      "size_in_bytes": 20480,
+      "op_tensor_size": 20480,
+      "offset": 4547059712,
+      "file_name": ".cache\\MatMulNBits_2_0_703.const",
+      "file_size": 20480
+    },
+    "model.layers.27.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 655360,
+      "op_tensor_size": 655360,
+      "offset": 4547080192,
+      "file_name": ".cache\\MatMulNBits_2_0_704.const",
+      "file_size": 655360
+    },
+    "model.layers.27.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 163840,
+      "op_tensor_size": 163840,
+      "offset": 4547735552,
+      "file_name": ".cache\\MatMulNBits_2_0_705.const",
+      "file_size": 163840
+    },
+    "model.layers.27.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        1024
+      ],
+      "size_in_bytes": 4194304,
+      "op_tensor_size": 4194304,
+      "offset": 4547899392,
+      "file_name": ".cache\\MatMulNBits_2_0_706.const",
+      "file_size": 4194304
+    },
+    "model.layers.27.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        1024
+      ],
+      "size_in_bytes": 4096,
+      "op_tensor_size": 4096,
+      "offset": 4552093696,
+      "file_name": ".cache\\MatMulNBits_2_0_707.const",
+      "file_size": 4096
+    },
+    "model.layers.27.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 4552097792,
+      "file_name": ".cache\\MatMulNBits_2_0_708.const",
+      "file_size": 131072
+    },
+    "model.layers.27.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 4552228864,
+      "file_name": ".cache\\MatMulNBits_2_0_709.const",
+      "file_size": 32768
+    },
+    "model.layers.27.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        4096
+      ],
+      "size_in_bytes": 16777216,
+      "op_tensor_size": 16777216,
+      "offset": 4552261632,
+      "file_name": ".cache\\MatMulNBits_2_0_710.const",
+      "file_size": 16777216
+    },
+    "model.layers.27.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 4569038848,
+      "file_name": ".cache\\MatMulNBits_2_0_711.const",
+      "file_size": 16384
+    },
+    "model.layers.27.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 524288,
+      "op_tensor_size": 524288,
+      "offset": 4569055232,
+      "file_name": ".cache\\MatMulNBits_2_0_712.const",
+      "file_size": 524288
+    },
+    "model.layers.27.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 4569579520,
+      "file_name": ".cache\\MatMulNBits_2_0_713.const",
+      "file_size": 131072
+    },
+    "model.layers.27.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 4569710592,
+      "file_name": ".cache\\MatMulNBits_2_0_714.const",
+      "file_size": 8192
+    },
+    "model.layers.27.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 4569718784,
+      "file_name": ".cache\\MatMulNBits_2_0_715.const",
+      "file_size": 29360128
+    },
+    "model.layers.27.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 4599078912,
+      "file_name": ".cache\\MatMulNBits_2_0_716.const",
+      "file_size": 1835008
+    },
+    "model.layers.27.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 4600913920,
+      "file_name": ".cache\\MatMulNBits_2_0_717.const",
+      "file_size": 229376
+    },
+    "model.layers.27.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 4601143296,
+      "file_name": ".cache\\MatMulNBits_2_0_718.const",
+      "file_size": 57344
+    },
+    "model.layers.27.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 4601200640,
+      "file_name": ".cache\\MatMulNBits_2_0_719.const",
+      "file_size": 29360128
+    },
+    "model.layers.27.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 4630560768,
+      "file_name": ".cache\\MatMulNBits_2_0_720.const",
+      "file_size": 1835008
+    },
+    "model.layers.27.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 4632395776,
+      "file_name": ".cache\\MatMulNBits_2_0_721.const",
+      "file_size": 229376
+    },
+    "model.layers.27.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 4632625152,
+      "file_name": ".cache\\MatMulNBits_2_0_722.const",
+      "file_size": 57344
+    },
+    "model.layers.27.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        14336,
+        4096
+      ],
+      "size_in_bytes": 58720256,
+      "op_tensor_size": 58720256,
+      "offset": 4632682496,
+      "file_name": ".cache\\MatMulNBits_2_0_723.const",
+      "file_size": 58720256
+    },
+    "model.layers.27.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 4691402752,
+      "file_name": ".cache\\MatMulNBits_2_0_724.const",
+      "file_size": 16384
+    },
+    "model.layers.27.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 4691419136,
+      "file_name": ".cache\\MatMulNBits_2_0_725.const",
+      "file_size": 1835008
+    },
+    "model.layers.27.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 458752,
+      "op_tensor_size": 458752,
+      "offset": 4693254144,
+      "file_name": ".cache\\MatMulNBits_2_0_726.const",
+      "file_size": 458752
+    },
+    "model.layers.28.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 4693712896,
+      "file_name": ".cache\\MatMulNBits_2_0_727.const",
+      "file_size": 8192
+    },
+    "model.layers.28.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        5120
+      ],
+      "size_in_bytes": 20971520,
+      "op_tensor_size": 20971520,
+      "offset": 4693721088,
+      "file_name": ".cache\\MatMulNBits_2_0_728.const",
+      "file_size": 20971520
+    },
+    "model.layers.28.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        5120
+      ],
+      "size_in_bytes": 20480,
+      "op_tensor_size": 20480,
+      "offset": 4714692608,
+      "file_name": ".cache\\MatMulNBits_2_0_729.const",
+      "file_size": 20480
+    },
+    "model.layers.28.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 655360,
+      "op_tensor_size": 655360,
+      "offset": 4714713088,
+      "file_name": ".cache\\MatMulNBits_2_0_730.const",
+      "file_size": 655360
+    },
+    "model.layers.28.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 163840,
+      "op_tensor_size": 163840,
+      "offset": 4715368448,
+      "file_name": ".cache\\MatMulNBits_2_0_731.const",
+      "file_size": 163840
+    },
+    "model.layers.28.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        1024
+      ],
+      "size_in_bytes": 4194304,
+      "op_tensor_size": 4194304,
+      "offset": 4715532288,
+      "file_name": ".cache\\MatMulNBits_2_0_732.const",
+      "file_size": 4194304
+    },
+    "model.layers.28.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        1024
+      ],
+      "size_in_bytes": 4096,
+      "op_tensor_size": 4096,
+      "offset": 4719726592,
+      "file_name": ".cache\\MatMulNBits_2_0_733.const",
+      "file_size": 4096
+    },
+    "model.layers.28.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 4719730688,
+      "file_name": ".cache\\MatMulNBits_2_0_734.const",
+      "file_size": 131072
+    },
+    "model.layers.28.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 4719861760,
+      "file_name": ".cache\\MatMulNBits_2_0_735.const",
+      "file_size": 32768
+    },
+    "model.layers.28.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        4096
+      ],
+      "size_in_bytes": 16777216,
+      "op_tensor_size": 16777216,
+      "offset": 4719894528,
+      "file_name": ".cache\\MatMulNBits_2_0_736.const",
+      "file_size": 16777216
+    },
+    "model.layers.28.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 4736671744,
+      "file_name": ".cache\\MatMulNBits_2_0_737.const",
+      "file_size": 16384
+    },
+    "model.layers.28.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 524288,
+      "op_tensor_size": 524288,
+      "offset": 4736688128,
+      "file_name": ".cache\\MatMulNBits_2_0_738.const",
+      "file_size": 524288
+    },
+    "model.layers.28.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 4737212416,
+      "file_name": ".cache\\MatMulNBits_2_0_739.const",
+      "file_size": 131072
+    },
+    "model.layers.28.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 4737343488,
+      "file_name": ".cache\\MatMulNBits_2_0_740.const",
+      "file_size": 8192
+    },
+    "model.layers.28.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 4737351680,
+      "file_name": ".cache\\MatMulNBits_2_0_741.const",
+      "file_size": 29360128
+    },
+    "model.layers.28.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 4766711808,
+      "file_name": ".cache\\MatMulNBits_2_0_742.const",
+      "file_size": 1835008
+    },
+    "model.layers.28.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 4768546816,
+      "file_name": ".cache\\MatMulNBits_2_0_743.const",
+      "file_size": 229376
+    },
+    "model.layers.28.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 4768776192,
+      "file_name": ".cache\\MatMulNBits_2_0_744.const",
+      "file_size": 57344
+    },
+    "model.layers.28.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 4768833536,
+      "file_name": ".cache\\MatMulNBits_2_0_745.const",
+      "file_size": 29360128
+    },
+    "model.layers.28.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 4798193664,
+      "file_name": ".cache\\MatMulNBits_2_0_746.const",
+      "file_size": 1835008
+    },
+    "model.layers.28.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 4800028672,
+      "file_name": ".cache\\MatMulNBits_2_0_747.const",
+      "file_size": 229376
+    },
+    "model.layers.28.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 4800258048,
+      "file_name": ".cache\\MatMulNBits_2_0_748.const",
+      "file_size": 57344
+    },
+    "model.layers.28.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        14336,
+        4096
+      ],
+      "size_in_bytes": 58720256,
+      "op_tensor_size": 58720256,
+      "offset": 4800315392,
+      "file_name": ".cache\\MatMulNBits_2_0_749.const",
+      "file_size": 58720256
+    },
+    "model.layers.28.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 4859035648,
+      "file_name": ".cache\\MatMulNBits_2_0_750.const",
+      "file_size": 16384
+    },
+    "model.layers.28.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 4859052032,
+      "file_name": ".cache\\MatMulNBits_2_0_751.const",
+      "file_size": 1835008
+    },
+    "model.layers.28.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 458752,
+      "op_tensor_size": 458752,
+      "offset": 4860887040,
+      "file_name": ".cache\\MatMulNBits_2_0_752.const",
+      "file_size": 458752
+    },
+    "model.layers.29.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 4861345792,
+      "file_name": ".cache\\MatMulNBits_2_0_753.const",
+      "file_size": 8192
+    },
+    "model.layers.29.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        5120
+      ],
+      "size_in_bytes": 20971520,
+      "op_tensor_size": 20971520,
+      "offset": 4861353984,
+      "file_name": ".cache\\MatMulNBits_2_0_754.const",
+      "file_size": 20971520
+    },
+    "model.layers.29.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        5120
+      ],
+      "size_in_bytes": 20480,
+      "op_tensor_size": 20480,
+      "offset": 4882325504,
+      "file_name": ".cache\\MatMulNBits_2_0_755.const",
+      "file_size": 20480
+    },
+    "model.layers.29.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 655360,
+      "op_tensor_size": 655360,
+      "offset": 4882345984,
+      "file_name": ".cache\\MatMulNBits_2_0_756.const",
+      "file_size": 655360
+    },
+    "model.layers.29.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 163840,
+      "op_tensor_size": 163840,
+      "offset": 4883001344,
+      "file_name": ".cache\\MatMulNBits_2_0_757.const",
+      "file_size": 163840
+    },
+    "model.layers.29.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        1024
+      ],
+      "size_in_bytes": 4194304,
+      "op_tensor_size": 4194304,
+      "offset": 4883165184,
+      "file_name": ".cache\\MatMulNBits_2_0_758.const",
+      "file_size": 4194304
+    },
+    "model.layers.29.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        1024
+      ],
+      "size_in_bytes": 4096,
+      "op_tensor_size": 4096,
+      "offset": 4887359488,
+      "file_name": ".cache\\MatMulNBits_2_0_759.const",
+      "file_size": 4096
+    },
+    "model.layers.29.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 4887363584,
+      "file_name": ".cache\\MatMulNBits_2_0_760.const",
+      "file_size": 131072
+    },
+    "model.layers.29.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 4887494656,
+      "file_name": ".cache\\MatMulNBits_2_0_761.const",
+      "file_size": 32768
+    },
+    "model.layers.29.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        4096
+      ],
+      "size_in_bytes": 16777216,
+      "op_tensor_size": 16777216,
+      "offset": 4887527424,
+      "file_name": ".cache\\MatMulNBits_2_0_762.const",
+      "file_size": 16777216
+    },
+    "model.layers.29.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 4904304640,
+      "file_name": ".cache\\MatMulNBits_2_0_763.const",
+      "file_size": 16384
+    },
+    "model.layers.29.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 524288,
+      "op_tensor_size": 524288,
+      "offset": 4904321024,
+      "file_name": ".cache\\MatMulNBits_2_0_764.const",
+      "file_size": 524288
+    },
+    "model.layers.29.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 4904845312,
+      "file_name": ".cache\\MatMulNBits_2_0_765.const",
+      "file_size": 131072
+    },
+    "model.layers.29.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 4904976384,
+      "file_name": ".cache\\MatMulNBits_2_0_766.const",
+      "file_size": 8192
+    },
+    "model.layers.29.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 4904984576,
+      "file_name": ".cache\\MatMulNBits_2_0_767.const",
+      "file_size": 29360128
+    },
+    "model.layers.29.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 4934344704,
+      "file_name": ".cache\\MatMulNBits_2_0_768.const",
+      "file_size": 1835008
+    },
+    "model.layers.29.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 4936179712,
+      "file_name": ".cache\\MatMulNBits_2_0_769.const",
+      "file_size": 229376
+    },
+    "model.layers.29.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 4936409088,
+      "file_name": ".cache\\MatMulNBits_2_0_770.const",
+      "file_size": 57344
+    },
+    "model.layers.29.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 4936466432,
+      "file_name": ".cache\\MatMulNBits_2_0_771.const",
+      "file_size": 29360128
+    },
+    "model.layers.29.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 4965826560,
+      "file_name": ".cache\\MatMulNBits_2_0_772.const",
+      "file_size": 1835008
+    },
+    "model.layers.29.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 4967661568,
+      "file_name": ".cache\\MatMulNBits_2_0_773.const",
+      "file_size": 229376
+    },
+    "model.layers.29.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 4967890944,
+      "file_name": ".cache\\MatMulNBits_2_0_774.const",
+      "file_size": 57344
+    },
+    "model.layers.29.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        14336,
+        4096
+      ],
+      "size_in_bytes": 58720256,
+      "op_tensor_size": 58720256,
+      "offset": 4967948288,
+      "file_name": ".cache\\MatMulNBits_2_0_775.const",
+      "file_size": 58720256
+    },
+    "model.layers.29.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 5026668544,
+      "file_name": ".cache\\MatMulNBits_2_0_776.const",
+      "file_size": 16384
+    },
+    "model.layers.29.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 5026684928,
+      "file_name": ".cache\\MatMulNBits_2_0_777.const",
+      "file_size": 1835008
+    },
+    "model.layers.29.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 458752,
+      "op_tensor_size": 458752,
+      "offset": 5028519936,
+      "file_name": ".cache\\MatMulNBits_2_0_778.const",
+      "file_size": 458752
+    },
+    "model.layers.30.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 5028978688,
+      "file_name": ".cache\\MatMulNBits_2_0_779.const",
+      "file_size": 8192
+    },
+    "model.layers.30.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        5120
+      ],
+      "size_in_bytes": 20971520,
+      "op_tensor_size": 20971520,
+      "offset": 5028986880,
+      "file_name": ".cache\\MatMulNBits_2_0_780.const",
+      "file_size": 20971520
+    },
+    "model.layers.30.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        5120
+      ],
+      "size_in_bytes": 20480,
+      "op_tensor_size": 20480,
+      "offset": 5049958400,
+      "file_name": ".cache\\MatMulNBits_2_0_781.const",
+      "file_size": 20480
+    },
+    "model.layers.30.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 655360,
+      "op_tensor_size": 655360,
+      "offset": 5049978880,
+      "file_name": ".cache\\MatMulNBits_2_0_782.const",
+      "file_size": 655360
+    },
+    "model.layers.30.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 163840,
+      "op_tensor_size": 163840,
+      "offset": 5050634240,
+      "file_name": ".cache\\MatMulNBits_2_0_783.const",
+      "file_size": 163840
+    },
+    "model.layers.30.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        1024
+      ],
+      "size_in_bytes": 4194304,
+      "op_tensor_size": 4194304,
+      "offset": 5050798080,
+      "file_name": ".cache\\MatMulNBits_2_0_784.const",
+      "file_size": 4194304
+    },
+    "model.layers.30.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        1024
+      ],
+      "size_in_bytes": 4096,
+      "op_tensor_size": 4096,
+      "offset": 5054992384,
+      "file_name": ".cache\\MatMulNBits_2_0_785.const",
+      "file_size": 4096
+    },
+    "model.layers.30.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 5054996480,
+      "file_name": ".cache\\MatMulNBits_2_0_786.const",
+      "file_size": 131072
+    },
+    "model.layers.30.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 5055127552,
+      "file_name": ".cache\\MatMulNBits_2_0_787.const",
+      "file_size": 32768
+    },
+    "model.layers.30.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        4096
+      ],
+      "size_in_bytes": 16777216,
+      "op_tensor_size": 16777216,
+      "offset": 5055160320,
+      "file_name": ".cache\\MatMulNBits_2_0_788.const",
+      "file_size": 16777216
+    },
+    "model.layers.30.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 5071937536,
+      "file_name": ".cache\\MatMulNBits_2_0_789.const",
+      "file_size": 16384
+    },
+    "model.layers.30.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 524288,
+      "op_tensor_size": 524288,
+      "offset": 5071953920,
+      "file_name": ".cache\\MatMulNBits_2_0_790.const",
+      "file_size": 524288
+    },
+    "model.layers.30.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 5072478208,
+      "file_name": ".cache\\MatMulNBits_2_0_791.const",
+      "file_size": 131072
+    },
+    "model.layers.30.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 5072609280,
+      "file_name": ".cache\\MatMulNBits_2_0_792.const",
+      "file_size": 8192
+    },
+    "model.layers.30.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 5072617472,
+      "file_name": ".cache\\MatMulNBits_2_0_793.const",
+      "file_size": 29360128
+    },
+    "model.layers.30.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 5101977600,
+      "file_name": ".cache\\MatMulNBits_2_0_794.const",
+      "file_size": 1835008
+    },
+    "model.layers.30.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 5103812608,
+      "file_name": ".cache\\MatMulNBits_2_0_795.const",
+      "file_size": 229376
+    },
+    "model.layers.30.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 5104041984,
+      "file_name": ".cache\\MatMulNBits_2_0_796.const",
+      "file_size": 57344
+    },
+    "model.layers.30.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 5104099328,
+      "file_name": ".cache\\MatMulNBits_2_0_797.const",
+      "file_size": 29360128
+    },
+    "model.layers.30.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 5133459456,
+      "file_name": ".cache\\MatMulNBits_2_0_798.const",
+      "file_size": 1835008
+    },
+    "model.layers.30.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 5135294464,
+      "file_name": ".cache\\MatMulNBits_2_0_799.const",
+      "file_size": 229376
+    },
+    "model.layers.30.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 5135523840,
+      "file_name": ".cache\\MatMulNBits_2_0_800.const",
+      "file_size": 57344
+    },
+    "model.layers.30.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        14336,
+        4096
+      ],
+      "size_in_bytes": 58720256,
+      "op_tensor_size": 58720256,
+      "offset": 5135581184,
+      "file_name": ".cache\\MatMulNBits_2_0_801.const",
+      "file_size": 58720256
+    },
+    "model.layers.30.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 5194301440,
+      "file_name": ".cache\\MatMulNBits_2_0_802.const",
+      "file_size": 16384
+    },
+    "model.layers.30.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 5194317824,
+      "file_name": ".cache\\MatMulNBits_2_0_803.const",
+      "file_size": 1835008
+    },
+    "model.layers.30.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 458752,
+      "op_tensor_size": 458752,
+      "offset": 5196152832,
+      "file_name": ".cache\\MatMulNBits_2_0_804.const",
+      "file_size": 458752
+    },
+    "model.layers.31.input_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 5196611584,
+      "file_name": ".cache\\MatMulNBits_2_0_805.const",
+      "file_size": 8192
+    },
+    "model.layers.31.attn.qk_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        5120
+      ],
+      "size_in_bytes": 20971520,
+      "op_tensor_size": 20971520,
+      "offset": 5196619776,
+      "file_name": ".cache\\MatMulNBits_2_0_806.const",
+      "file_size": 20971520
+    },
+    "model.layers.31.attn.qk_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        5120
+      ],
+      "size_in_bytes": 20480,
+      "op_tensor_size": 20480,
+      "offset": 5217591296,
+      "file_name": ".cache\\MatMulNBits_2_0_807.const",
+      "file_size": 20480
+    },
+    "model.layers.31.attn.qk_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 655360,
+      "op_tensor_size": 655360,
+      "offset": 5217611776,
+      "file_name": ".cache\\MatMulNBits_2_0_808.const",
+      "file_size": 655360
+    },
+    "model.layers.31.attn.qk_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        163840
+      ],
+      "size_in_bytes": 163840,
+      "op_tensor_size": 163840,
+      "offset": 5218267136,
+      "file_name": ".cache\\MatMulNBits_2_0_809.const",
+      "file_size": 163840
+    },
+    "model.layers.31.attn.v_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        1024
+      ],
+      "size_in_bytes": 4194304,
+      "op_tensor_size": 4194304,
+      "offset": 5218430976,
+      "file_name": ".cache\\MatMulNBits_2_0_810.const",
+      "file_size": 4194304
+    },
+    "model.layers.31.attn.v_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        1024
+      ],
+      "size_in_bytes": 4096,
+      "op_tensor_size": 4096,
+      "offset": 5222625280,
+      "file_name": ".cache\\MatMulNBits_2_0_811.const",
+      "file_size": 4096
+    },
+    "model.layers.31.attn.v_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 5222629376,
+      "file_name": ".cache\\MatMulNBits_2_0_812.const",
+      "file_size": 131072
+    },
+    "model.layers.31.attn.v_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 32768,
+      "op_tensor_size": 32768,
+      "offset": 5222760448,
+      "file_name": ".cache\\MatMulNBits_2_0_813.const",
+      "file_size": 32768
+    },
+    "model.layers.31.attn.o_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        4096
+      ],
+      "size_in_bytes": 16777216,
+      "op_tensor_size": 16777216,
+      "offset": 5222793216,
+      "file_name": ".cache\\MatMulNBits_2_0_814.const",
+      "file_size": 16777216
+    },
+    "model.layers.31.attn.o_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 5239570432,
+      "file_name": ".cache\\MatMulNBits_2_0_815.const",
+      "file_size": 16384
+    },
+    "model.layers.31.attn.o_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 524288,
+      "op_tensor_size": 524288,
+      "offset": 5239586816,
+      "file_name": ".cache\\MatMulNBits_2_0_816.const",
+      "file_size": 524288
+    },
+    "model.layers.31.attn.o_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        131072
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 5240111104,
+      "file_name": ".cache\\MatMulNBits_2_0_817.const",
+      "file_size": 131072
+    },
+    "model.layers.31.post_attention_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 5240242176,
+      "file_name": ".cache\\MatMulNBits_2_0_818.const",
+      "file_size": 8192
+    },
+    "model.layers.31.mlp.gate_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 5240250368,
+      "file_name": ".cache\\MatMulNBits_2_0_819.const",
+      "file_size": 29360128
+    },
+    "model.layers.31.mlp.gate_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 5269610496,
+      "file_name": ".cache\\MatMulNBits_2_0_820.const",
+      "file_size": 1835008
+    },
+    "model.layers.31.mlp.gate_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 5271445504,
+      "file_name": ".cache\\MatMulNBits_2_0_821.const",
+      "file_size": 229376
+    },
+    "model.layers.31.mlp.gate_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 5271674880,
+      "file_name": ".cache\\MatMulNBits_2_0_822.const",
+      "file_size": 57344
+    },
+    "model.layers.31.mlp.up_proj.MatMulNBits.qweight": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        14336,
+        32,
+        64
+      ],
+      "size_in_bytes": 29360128,
+      "op_tensor_size": 29360128,
+      "offset": 5271732224,
+      "file_name": ".cache\\MatMulNBits_2_0_823.const",
+      "file_size": 29360128
+    },
+    "model.layers.31.mlp.up_proj.MatMulNBits.scales.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 5301092352,
+      "file_name": ".cache\\MatMulNBits_2_0_824.const",
+      "file_size": 1835008
+    },
+    "model.layers.31.mlp.up_proj.MatMulNBits.qzeros": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "uint8",
+      "shape": [
+        229376
+      ],
+      "size_in_bytes": 229376,
+      "op_tensor_size": 229376,
+      "offset": 5302927360,
+      "file_name": ".cache\\MatMulNBits_2_0_825.const",
+      "file_size": 229376
+    },
+    "model.layers.31.mlp.up_proj.MatMulNBits.bias.f": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        14336
+      ],
+      "size_in_bytes": 57344,
+      "op_tensor_size": 57344,
+      "offset": 5303156736,
+      "file_name": ".cache\\MatMulNBits_2_0_826.const",
+      "file_size": 57344
+    },
+    "model.layers.31.mlp.down_proj.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        14336,
+        4096
+      ],
+      "size_in_bytes": 58720256,
+      "op_tensor_size": 58720256,
+      "offset": 5303214080,
+      "file_name": ".cache\\MatMulNBits_2_0_827.const",
+      "file_size": 58720256
+    },
+    "model.layers.31.mlp.down_proj.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 16384,
+      "op_tensor_size": 16384,
+      "offset": 5361934336,
+      "file_name": ".cache\\MatMulNBits_2_0_828.const",
+      "file_size": 16384
+    },
+    "model.layers.31.mlp.down_proj.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 1835008,
+      "op_tensor_size": 1835008,
+      "offset": 5361950720,
+      "file_name": ".cache\\MatMulNBits_2_0_829.const",
+      "file_size": 1835008
+    },
+    "model.layers.31.mlp.down_proj.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        458752
+      ],
+      "size_in_bytes": 458752,
+      "op_tensor_size": 458752,
+      "offset": 5363785728,
+      "file_name": ".cache\\MatMulNBits_2_0_830.const",
+      "file_size": 458752
+    },
+    "model.layers.32.final_norm_layernorm.weight.bf": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "bfloat16",
+      "shape": [
+        4096
+      ],
+      "size_in_bytes": 8192,
+      "op_tensor_size": 8192,
+      "offset": 5364244480,
+      "file_name": ".cache\\MatMulNBits_2_0_831.const",
+      "file_size": 8192
+    },
+    "lm_head.MatMulNBits.qweight.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        4096,
+        32768
+      ],
+      "size_in_bytes": 134217728,
+      "op_tensor_size": 134217728,
+      "offset": 5364252672,
+      "file_name": ".cache\\MatMulNBits_2_0_832.const",
+      "file_size": 134217728
+    },
+    "lm_head.MatMulNBits.bias.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        32768
+      ],
+      "size_in_bytes": 131072,
+      "op_tensor_size": 131072,
+      "offset": 5498470400,
+      "file_name": ".cache\\MatMulNBits_2_0_833.const",
+      "file_size": 131072
+    },
+    "lm_head.MatMulNBits.scales.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "float",
+      "shape": [
+        1048576
+      ],
+      "size_in_bytes": 4194304,
+      "op_tensor_size": 4194304,
+      "offset": 5498601472,
+      "file_name": ".cache\\MatMulNBits_2_0_834.const",
+      "file_size": 4194304
+    },
+    "lm_head.MatMulNBits.qzeros.preformat": {
+      "packed_buffer_label": "const",
+      "xrt_arg_id": 3,
+      "dtype": "int8",
+      "shape": [
+        1048576
+      ],
+      "size_in_bytes": 1048576,
+      "op_tensor_size": 1048576,
+      "offset": 5502795776,
+      "file_name": ".cache\\MatMulNBits_2_0_835.const",
+      "file_size": 1048576
+    },
+    "past_key_values.0.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 0
+    },
+    "past_key_values.0.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 8388608
+    },
+    "present.0.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 0
+    },
+    "present.0.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 8388608
+    },
+    "past_key_values.1.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 16777216
+    },
+    "past_key_values.1.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 25165824
+    },
+    "present.1.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 16777216
+    },
+    "present.1.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 25165824
+    },
+    "past_key_values.2.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 33554432
+    },
+    "past_key_values.2.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 41943040
+    },
+    "present.2.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 33554432
+    },
+    "present.2.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 41943040
+    },
+    "past_key_values.3.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 50331648
+    },
+    "past_key_values.3.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 58720256
+    },
+    "present.3.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 50331648
+    },
+    "present.3.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 58720256
+    },
+    "past_key_values.4.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 67108864
+    },
+    "past_key_values.4.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 75497472
+    },
+    "present.4.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 67108864
+    },
+    "present.4.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 75497472
+    },
+    "past_key_values.5.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 83886080
+    },
+    "past_key_values.5.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 92274688
+    },
+    "present.5.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 83886080
+    },
+    "present.5.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 92274688
+    },
+    "past_key_values.6.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 100663296
+    },
+    "past_key_values.6.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 109051904
+    },
+    "present.6.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 100663296
+    },
+    "present.6.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 109051904
+    },
+    "past_key_values.7.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 117440512
+    },
+    "past_key_values.7.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 125829120
+    },
+    "present.7.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 117440512
+    },
+    "present.7.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 125829120
+    },
+    "past_key_values.8.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 134217728
+    },
+    "past_key_values.8.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 142606336
+    },
+    "present.8.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 134217728
+    },
+    "present.8.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 142606336
+    },
+    "past_key_values.9.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 150994944
+    },
+    "past_key_values.9.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 159383552
+    },
+    "present.9.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 150994944
+    },
+    "present.9.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 159383552
+    },
+    "past_key_values.10.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 167772160
+    },
+    "past_key_values.10.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 176160768
+    },
+    "present.10.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 167772160
+    },
+    "present.10.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 176160768
+    },
+    "past_key_values.11.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 184549376
+    },
+    "past_key_values.11.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 192937984
+    },
+    "present.11.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 184549376
+    },
+    "present.11.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 192937984
+    },
+    "past_key_values.12.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 201326592
+    },
+    "past_key_values.12.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 209715200
+    },
+    "present.12.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 201326592
+    },
+    "present.12.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 209715200
+    },
+    "past_key_values.13.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 218103808
+    },
+    "past_key_values.13.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 226492416
+    },
+    "present.13.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 218103808
+    },
+    "present.13.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 226492416
+    },
+    "past_key_values.14.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 234881024
+    },
+    "past_key_values.14.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 243269632
+    },
+    "present.14.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 234881024
+    },
+    "present.14.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 243269632
+    },
+    "past_key_values.15.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 251658240
+    },
+    "past_key_values.15.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 260046848
+    },
+    "present.15.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 251658240
+    },
+    "present.15.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 260046848
+    },
+    "past_key_values.16.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 268435456
+    },
+    "past_key_values.16.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 276824064
+    },
+    "present.16.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 268435456
+    },
+    "present.16.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 276824064
+    },
+    "past_key_values.17.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 285212672
+    },
+    "past_key_values.17.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 293601280
+    },
+    "present.17.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 285212672
+    },
+    "present.17.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 293601280
+    },
+    "past_key_values.18.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 301989888
+    },
+    "past_key_values.18.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 310378496
+    },
+    "present.18.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 301989888
+    },
+    "present.18.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 310378496
+    },
+    "past_key_values.19.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 318767104
+    },
+    "past_key_values.19.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 327155712
+    },
+    "present.19.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 318767104
+    },
+    "present.19.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 327155712
+    },
+    "past_key_values.20.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 335544320
+    },
+    "past_key_values.20.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 343932928
+    },
+    "present.20.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 335544320
+    },
+    "present.20.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 343932928
+    },
+    "past_key_values.21.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 352321536
+    },
+    "past_key_values.21.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 360710144
+    },
+    "present.21.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 352321536
+    },
+    "present.21.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 360710144
+    },
+    "past_key_values.22.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 369098752
+    },
+    "past_key_values.22.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 377487360
+    },
+    "present.22.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 369098752
+    },
+    "present.22.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 377487360
+    },
+    "past_key_values.23.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 385875968
+    },
+    "past_key_values.23.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 394264576
+    },
+    "present.23.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 385875968
+    },
+    "present.23.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 394264576
+    },
+    "past_key_values.24.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 402653184
+    },
+    "past_key_values.24.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 411041792
+    },
+    "present.24.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 402653184
+    },
+    "present.24.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 411041792
+    },
+    "past_key_values.25.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 419430400
+    },
+    "past_key_values.25.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 427819008
+    },
+    "present.25.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 419430400
+    },
+    "present.25.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 427819008
+    },
+    "past_key_values.26.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 436207616
+    },
+    "past_key_values.26.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 444596224
+    },
+    "present.26.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 436207616
+    },
+    "present.26.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 444596224
+    },
+    "past_key_values.27.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 452984832
+    },
+    "past_key_values.27.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 461373440
+    },
+    "present.27.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 452984832
+    },
+    "present.27.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 461373440
+    },
+    "past_key_values.28.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 469762048
+    },
+    "past_key_values.28.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 478150656
+    },
+    "present.28.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 469762048
+    },
+    "present.28.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 478150656
+    },
+    "past_key_values.29.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 486539264
+    },
+    "past_key_values.29.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 494927872
+    },
+    "present.29.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 486539264
+    },
+    "present.29.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 494927872
+    },
+    "past_key_values.30.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 503316480
+    },
+    "past_key_values.30.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 511705088
+    },
+    "present.30.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 503316480
+    },
+    "present.30.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 511705088
+    },
+    "past_key_values.31.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 520093696
+    },
+    "past_key_values.31.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 528482304
+    },
+    "present.31.key": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 520093696
+    },
+    "present.31.value": {
+      "packed_buffer_label": "ext_buf_0",
+      "xrt_arg_id": 5,
+      "dtype": "bfloat16",
+      "shape": [
+        1,
+        8,
+        4096,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 528482304
+    },
+    "sin_cos_cache_token": {
+      "packed_buffer_label": "ext_buf_1",
+      "xrt_arg_id": 6,
+      "dtype": "bfloat16",
+      "shape": [
+        32768,
+        128
+      ],
+      "size_in_bytes": 8388608,
+      "op_tensor_size": 8388608,
+      "offset": 0
+    }
+  },
+  "aux_info": {
+    "is_llm": true
+  }
+}
\ No newline at end of file