diff --git "a/.cache/MatMulNBits_2_0_meta.json" "b/.cache/MatMulNBits_2_0_meta.json" new file mode 100644--- /dev/null +++ "b/.cache/MatMulNBits_2_0_meta.json" @@ -0,0 +1,36155 @@ +{ + "dd_meta_major_version": 1, + "dd_meta_minor_version": 4, + "state_table_updates": [ + { + "state_table_idx": 0, + "update_func": 1, + "update_arg": 1 + } + ], + "op_list": [ + { + "name": "MatMulNBits_2_0", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.0/input_layernorm/output_0.out5_4_0" + ], + "const_args": [ + "model.layers.0.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.0.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.0.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.0.attn.qk_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.0/attn/qk_proj/MatMulNBits/output_0.out5_4_0" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "5120" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "layers.0.attn.v_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.0/input_layernorm/output_0.out5_4_0" + ], + "const_args": [ + "model.layers.0.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.0.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.0.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.0.attn.v_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "present.0.value" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "1024" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "total_seq_len": { + "type": "int", + "value": [ + "4096" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "5", + "0", + "3", + "1" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "5", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "/model/layers.0/attn/GroupQueryAttention", + "type": "FLATMHA", + "in_args": [ + "/model/layers.0/attn/qk_proj/MatMulNBits/output_0.out5_4_0", + "past_key_values.0.key", + "past_key_values.0.value", + "attention_mask_const_uint", + "sin_cos_cache_token" + ], + "const_args": [], + "out_args": [ + "/model/layers.0/attn/GroupQueryAttention/output_0.out2_0", + "present.0.key" + ], + "attrs": { + "num_heads": { + "type": "int", + "value": [ + "32" + ] + }, + "kv_num_heads": { + "type": "int", + "value": [ + "8" + ] + }, + "scale": { + "type": "float", + "value": [ + "0.0883883461356163" + ] + }, + "softcap": { + "type": "float", + "value": [ + "0.0" + ] + }, + "do_rotary": { + "type": "int", + "value": [ + "0" + ] + }, + "rotary_interleaved": { + "type": "int", + "value": [ + "0" + ] + }, + "input_shape": { + "type": "int", + "value": [ + "8", + "32", + "1", + "4096", + "128" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "4", + "1", + "0", + "0", + "1", + "0", + "0", + "0", + "2", + "0", + "1", + "1", + "6", + "0", + "2", + "0" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "4", + "0", + "0", + "256", + "6", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "layers.0.attn.o_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.0/attn/GroupQueryAttention/output_0.out2_0" + ], + "const_args": [ + "model.layers.0.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.0.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.0.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.0.attn.o_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.0/attn/o_proj/MatMulNBits/output_0.out5_4_1" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_0", + "type": "FlatRMSAdd", + "in_args": [ + "/model/embed_tokens/Gather/output_0.out4_0", + "/model/layers.0/attn/o_proj/MatMulNBits/output_0.out5_4_1" + ], + "const_args": [ + "model.layers.0.post_attention_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.0/post_attention_layernorm/output_3.out4_0", + "/model/layers.0/post_attention_layernorm/output_0.out4_0" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "FlatMLP_3_0", + "type": "FlatMLP", + "in_args": [ + "/model/layers.0/post_attention_layernorm/output_0.out4_0" + ], + "const_args": [ + "model.layers.0.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.0.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.0.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.0.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.0.mlp.up_proj.MatMulNBits.qweight", + "model.layers.0.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.0.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.0.mlp.up_proj.MatMulNBits.bias.f" + ], + "out_args": [ + "/model/layers.0/mlp/Mul/output_0.out3_0" + ], + "attrs": { + "input_shape": { + "type": "int", + "value": [ + "1", + "4096", + "14336" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16" + ] + } + } + }, + { + "name": "layers.0.mlp.down_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.0/mlp/Mul/output_0.out3_0" + ], + "const_args": [ + "model.layers.0.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.0.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.0.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.0.mlp.down_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.0/mlp/down_proj/MatMulNBits/output_0.out5_4_2" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "14336" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_1", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.0/post_attention_layernorm/output_3.out4_0", + "/model/layers.0/mlp/down_proj/MatMulNBits/output_0.out5_4_2" + ], + "const_args": [ + "model.layers.1.input_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.1/input_layernorm/output_3.out4_1", + "/model/layers.1/input_layernorm/output_0.out4_1" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "MatMulNBits_2_1", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.1/input_layernorm/output_0.out4_1" + ], + "const_args": [ + "model.layers.1.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.1.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.1.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.1.attn.qk_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.1/attn/qk_proj/MatMulNBits/output_0.out5_4_3" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "5120" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "layers.1.attn.v_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.1/input_layernorm/output_0.out4_1" + ], + "const_args": [ + "model.layers.1.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.1.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.1.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.1.attn.v_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "present.1.value" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "1024" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "total_seq_len": { + "type": "int", + "value": [ + "4096" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "5", + "0", + "7", + "3" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "5", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "/model/layers.1/attn/GroupQueryAttention", + "type": "FLATMHA", + "in_args": [ + "/model/layers.1/attn/qk_proj/MatMulNBits/output_0.out5_4_3", + "past_key_values.1.key", + "past_key_values.1.value", + "attention_mask_const_uint", + "sin_cos_cache_token" + ], + "const_args": [], + "out_args": [ + "/model/layers.1/attn/GroupQueryAttention/output_0.out2_1", + "present.1.key" + ], + "attrs": { + "num_heads": { + "type": "int", + "value": [ + "32" + ] + }, + "kv_num_heads": { + "type": "int", + "value": [ + "8" + ] + }, + "scale": { + "type": "float", + "value": [ + "0.0883883461356163" + ] + }, + "softcap": { + "type": "float", + "value": [ + "0.0" + ] + }, + "do_rotary": { + "type": "int", + "value": [ + "0" + ] + }, + "rotary_interleaved": { + "type": "int", + "value": [ + "0" + ] + }, + "input_shape": { + "type": "int", + "value": [ + "8", + "32", + "1", + "4096", + "128" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "4", + "1", + "0", + "0", + "1", + "0", + "4", + "2", + "2", + "0", + "5", + "3", + "6", + "0", + "6", + "2" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "4", + "0", + "0", + "256", + "6", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "layers.1.attn.o_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.1/attn/GroupQueryAttention/output_0.out2_1" + ], + "const_args": [ + "model.layers.1.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.1.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.1.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.1.attn.o_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.1/attn/o_proj/MatMulNBits/output_0.out5_4_4" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_2", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.1/input_layernorm/output_3.out4_1", + "/model/layers.1/attn/o_proj/MatMulNBits/output_0.out5_4_4" + ], + "const_args": [ + "model.layers.1.post_attention_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.1/post_attention_layernorm/output_3.out4_2", + "/model/layers.1/post_attention_layernorm/output_0.out4_2" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "FlatMLP_3_1", + "type": "FlatMLP", + "in_args": [ + "/model/layers.1/post_attention_layernorm/output_0.out4_2" + ], + "const_args": [ + "model.layers.1.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.1.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.1.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.1.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.1.mlp.up_proj.MatMulNBits.qweight", + "model.layers.1.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.1.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.1.mlp.up_proj.MatMulNBits.bias.f" + ], + "out_args": [ + "/model/layers.1/mlp/Mul/output_0.out3_1" + ], + "attrs": { + "input_shape": { + "type": "int", + "value": [ + "1", + "4096", + "14336" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16" + ] + } + } + }, + { + "name": "layers.1.mlp.down_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.1/mlp/Mul/output_0.out3_1" + ], + "const_args": [ + "model.layers.1.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.1.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.1.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.1.mlp.down_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.1/mlp/down_proj/MatMulNBits/output_0.out5_4_5" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "14336" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_3", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.1/post_attention_layernorm/output_3.out4_2", + "/model/layers.1/mlp/down_proj/MatMulNBits/output_0.out5_4_5" + ], + "const_args": [ + "model.layers.2.input_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.2/input_layernorm/output_3.out4_3", + "/model/layers.2/input_layernorm/output_0.out4_3" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "MatMulNBits_2_2", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.2/input_layernorm/output_0.out4_3" + ], + "const_args": [ + "model.layers.2.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.2.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.2.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.2.attn.qk_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.2/attn/qk_proj/MatMulNBits/output_0.out5_4_6" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "5120" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "layers.2.attn.v_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.2/input_layernorm/output_0.out4_3" + ], + "const_args": [ + "model.layers.2.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.2.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.2.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.2.attn.v_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "present.2.value" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "1024" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "total_seq_len": { + "type": "int", + "value": [ + "4096" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "5", + "0", + "11", + "5" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "5", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "/model/layers.2/attn/GroupQueryAttention", + "type": "FLATMHA", + "in_args": [ + "/model/layers.2/attn/qk_proj/MatMulNBits/output_0.out5_4_6", + "past_key_values.2.key", + "past_key_values.2.value", + "attention_mask_const_uint", + "sin_cos_cache_token" + ], + "const_args": [], + "out_args": [ + "/model/layers.2/attn/GroupQueryAttention/output_0.out2_2", + "present.2.key" + ], + "attrs": { + "num_heads": { + "type": "int", + "value": [ + "32" + ] + }, + "kv_num_heads": { + "type": "int", + "value": [ + "8" + ] + }, + "scale": { + "type": "float", + "value": [ + "0.0883883461356163" + ] + }, + "softcap": { + "type": "float", + "value": [ + "0.0" + ] + }, + "do_rotary": { + "type": "int", + "value": [ + "0" + ] + }, + "rotary_interleaved": { + "type": "int", + "value": [ + "0" + ] + }, + "input_shape": { + "type": "int", + "value": [ + "8", + "32", + "1", + "4096", + "128" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "4", + "1", + "0", + "0", + "1", + "0", + "8", + "4", + "2", + "0", + "9", + "5", + "6", + "0", + "10", + "4" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "4", + "0", + "0", + "256", + "6", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "layers.2.attn.o_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.2/attn/GroupQueryAttention/output_0.out2_2" + ], + "const_args": [ + "model.layers.2.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.2.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.2.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.2.attn.o_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.2/attn/o_proj/MatMulNBits/output_0.out5_4_7" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_4", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.2/input_layernorm/output_3.out4_3", + "/model/layers.2/attn/o_proj/MatMulNBits/output_0.out5_4_7" + ], + "const_args": [ + "model.layers.2.post_attention_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.2/post_attention_layernorm/output_3.out4_4", + "/model/layers.2/post_attention_layernorm/output_0.out4_4" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "FlatMLP_3_2", + "type": "FlatMLP", + "in_args": [ + "/model/layers.2/post_attention_layernorm/output_0.out4_4" + ], + "const_args": [ + "model.layers.2.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.2.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.2.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.2.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.2.mlp.up_proj.MatMulNBits.qweight", + "model.layers.2.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.2.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.2.mlp.up_proj.MatMulNBits.bias.f" + ], + "out_args": [ + "/model/layers.2/mlp/Mul/output_0.out3_2" + ], + "attrs": { + "input_shape": { + "type": "int", + "value": [ + "1", + "4096", + "14336" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16" + ] + } + } + }, + { + "name": "layers.2.mlp.down_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.2/mlp/Mul/output_0.out3_2" + ], + "const_args": [ + "model.layers.2.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.2.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.2.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.2.mlp.down_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.2/mlp/down_proj/MatMulNBits/output_0.out5_4_8" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "14336" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_5", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.2/post_attention_layernorm/output_3.out4_4", + "/model/layers.2/mlp/down_proj/MatMulNBits/output_0.out5_4_8" + ], + "const_args": [ + "model.layers.3.input_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.3/input_layernorm/output_3.out4_5", + "/model/layers.3/input_layernorm/output_0.out4_5" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "MatMulNBits_2_3", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.3/input_layernorm/output_0.out4_5" + ], + "const_args": [ + "model.layers.3.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.3.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.3.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.3.attn.qk_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.3/attn/qk_proj/MatMulNBits/output_0.out5_4_9" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "5120" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "layers.3.attn.v_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.3/input_layernorm/output_0.out4_5" + ], + "const_args": [ + "model.layers.3.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.3.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.3.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.3.attn.v_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "present.3.value" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "1024" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "total_seq_len": { + "type": "int", + "value": [ + "4096" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "5", + "0", + "15", + "7" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "5", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "/model/layers.3/attn/GroupQueryAttention", + "type": "FLATMHA", + "in_args": [ + "/model/layers.3/attn/qk_proj/MatMulNBits/output_0.out5_4_9", + "past_key_values.3.key", + "past_key_values.3.value", + "attention_mask_const_uint", + "sin_cos_cache_token" + ], + "const_args": [], + "out_args": [ + "/model/layers.3/attn/GroupQueryAttention/output_0.out2_3", + "present.3.key" + ], + "attrs": { + "num_heads": { + "type": "int", + "value": [ + "32" + ] + }, + "kv_num_heads": { + "type": "int", + "value": [ + "8" + ] + }, + "scale": { + "type": "float", + "value": [ + "0.0883883461356163" + ] + }, + "softcap": { + "type": "float", + "value": [ + "0.0" + ] + }, + "do_rotary": { + "type": "int", + "value": [ + "0" + ] + }, + "rotary_interleaved": { + "type": "int", + "value": [ + "0" + ] + }, + "input_shape": { + "type": "int", + "value": [ + "8", + "32", + "1", + "4096", + "128" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "4", + "1", + "0", + "0", + "1", + "0", + "12", + "6", + "2", + "0", + "13", + "7", + "6", + "0", + "14", + "6" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "4", + "0", + "0", + "256", + "6", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "layers.3.attn.o_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.3/attn/GroupQueryAttention/output_0.out2_3" + ], + "const_args": [ + "model.layers.3.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.3.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.3.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.3.attn.o_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.3/attn/o_proj/MatMulNBits/output_0.out5_4_10" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_6", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.3/input_layernorm/output_3.out4_5", + "/model/layers.3/attn/o_proj/MatMulNBits/output_0.out5_4_10" + ], + "const_args": [ + "model.layers.3.post_attention_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.3/post_attention_layernorm/output_3.out4_6", + "/model/layers.3/post_attention_layernorm/output_0.out4_6" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "FlatMLP_3_3", + "type": "FlatMLP", + "in_args": [ + "/model/layers.3/post_attention_layernorm/output_0.out4_6" + ], + "const_args": [ + "model.layers.3.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.3.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.3.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.3.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.3.mlp.up_proj.MatMulNBits.qweight", + "model.layers.3.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.3.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.3.mlp.up_proj.MatMulNBits.bias.f" + ], + "out_args": [ + "/model/layers.3/mlp/Mul/output_0.out3_3" + ], + "attrs": { + "input_shape": { + "type": "int", + "value": [ + "1", + "4096", + "14336" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16" + ] + } + } + }, + { + "name": "layers.3.mlp.down_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.3/mlp/Mul/output_0.out3_3" + ], + "const_args": [ + "model.layers.3.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.3.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.3.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.3.mlp.down_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.3/mlp/down_proj/MatMulNBits/output_0.out5_4_11" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "14336" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_7", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.3/post_attention_layernorm/output_3.out4_6", + "/model/layers.3/mlp/down_proj/MatMulNBits/output_0.out5_4_11" + ], + "const_args": [ + "model.layers.4.input_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.4/input_layernorm/output_3.out4_7", + "/model/layers.4/input_layernorm/output_0.out4_7" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "MatMulNBits_2_4", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.4/input_layernorm/output_0.out4_7" + ], + "const_args": [ + "model.layers.4.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.4.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.4.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.4.attn.qk_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.4/attn/qk_proj/MatMulNBits/output_0.out5_4_12" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "5120" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "layers.4.attn.v_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.4/input_layernorm/output_0.out4_7" + ], + "const_args": [ + "model.layers.4.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.4.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.4.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.4.attn.v_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "present.4.value" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "1024" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "total_seq_len": { + "type": "int", + "value": [ + "4096" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "5", + "0", + "19", + "9" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "5", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "/model/layers.4/attn/GroupQueryAttention", + "type": "FLATMHA", + "in_args": [ + "/model/layers.4/attn/qk_proj/MatMulNBits/output_0.out5_4_12", + "past_key_values.4.key", + "past_key_values.4.value", + "attention_mask_const_uint", + "sin_cos_cache_token" + ], + "const_args": [], + "out_args": [ + "/model/layers.4/attn/GroupQueryAttention/output_0.out2_4", + "present.4.key" + ], + "attrs": { + "num_heads": { + "type": "int", + "value": [ + "32" + ] + }, + "kv_num_heads": { + "type": "int", + "value": [ + "8" + ] + }, + "scale": { + "type": "float", + "value": [ + "0.0883883461356163" + ] + }, + "softcap": { + "type": "float", + "value": [ + "0.0" + ] + }, + "do_rotary": { + "type": "int", + "value": [ + "0" + ] + }, + "rotary_interleaved": { + "type": "int", + "value": [ + "0" + ] + }, + "input_shape": { + "type": "int", + "value": [ + "8", + "32", + "1", + "4096", + "128" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "4", + "1", + "0", + "0", + "1", + "0", + "16", + "8", + "2", + "0", + "17", + "9", + "6", + "0", + "18", + "8" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "4", + "0", + "0", + "256", + "6", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "layers.4.attn.o_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.4/attn/GroupQueryAttention/output_0.out2_4" + ], + "const_args": [ + "model.layers.4.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.4.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.4.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.4.attn.o_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.4/attn/o_proj/MatMulNBits/output_0.out5_4_13" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_8", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.4/input_layernorm/output_3.out4_7", + "/model/layers.4/attn/o_proj/MatMulNBits/output_0.out5_4_13" + ], + "const_args": [ + "model.layers.4.post_attention_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.4/post_attention_layernorm/output_3.out4_8", + "/model/layers.4/post_attention_layernorm/output_0.out4_8" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "FlatMLP_3_4", + "type": "FlatMLP", + "in_args": [ + "/model/layers.4/post_attention_layernorm/output_0.out4_8" + ], + "const_args": [ + "model.layers.4.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.4.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.4.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.4.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.4.mlp.up_proj.MatMulNBits.qweight", + "model.layers.4.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.4.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.4.mlp.up_proj.MatMulNBits.bias.f" + ], + "out_args": [ + "/model/layers.4/mlp/Mul/output_0.out3_4" + ], + "attrs": { + "input_shape": { + "type": "int", + "value": [ + "1", + "4096", + "14336" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16" + ] + } + } + }, + { + "name": "layers.4.mlp.down_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.4/mlp/Mul/output_0.out3_4" + ], + "const_args": [ + "model.layers.4.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.4.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.4.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.4.mlp.down_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.4/mlp/down_proj/MatMulNBits/output_0.out5_4_14" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "14336" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_9", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.4/post_attention_layernorm/output_3.out4_8", + "/model/layers.4/mlp/down_proj/MatMulNBits/output_0.out5_4_14" + ], + "const_args": [ + "model.layers.5.input_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.5/input_layernorm/output_3.out4_9", + "/model/layers.5/input_layernorm/output_0.out4_9" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "MatMulNBits_2_5", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.5/input_layernorm/output_0.out4_9" + ], + "const_args": [ + "model.layers.5.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.5.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.5.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.5.attn.qk_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.5/attn/qk_proj/MatMulNBits/output_0.out5_4_15" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "5120" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "layers.5.attn.v_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.5/input_layernorm/output_0.out4_9" + ], + "const_args": [ + "model.layers.5.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.5.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.5.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.5.attn.v_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "present.5.value" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "1024" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "total_seq_len": { + "type": "int", + "value": [ + "4096" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "5", + "0", + "23", + "11" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "5", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "/model/layers.5/attn/GroupQueryAttention", + "type": "FLATMHA", + "in_args": [ + "/model/layers.5/attn/qk_proj/MatMulNBits/output_0.out5_4_15", + "past_key_values.5.key", + "past_key_values.5.value", + "attention_mask_const_uint", + "sin_cos_cache_token" + ], + "const_args": [], + "out_args": [ + "/model/layers.5/attn/GroupQueryAttention/output_0.out2_5", + "present.5.key" + ], + "attrs": { + "num_heads": { + "type": "int", + "value": [ + "32" + ] + }, + "kv_num_heads": { + "type": "int", + "value": [ + "8" + ] + }, + "scale": { + "type": "float", + "value": [ + "0.0883883461356163" + ] + }, + "softcap": { + "type": "float", + "value": [ + "0.0" + ] + }, + "do_rotary": { + "type": "int", + "value": [ + "0" + ] + }, + "rotary_interleaved": { + "type": "int", + "value": [ + "0" + ] + }, + "input_shape": { + "type": "int", + "value": [ + "8", + "32", + "1", + "4096", + "128" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "4", + "1", + "0", + "0", + "1", + "0", + "20", + "10", + "2", + "0", + "21", + "11", + "6", + "0", + "22", + "10" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "4", + "0", + "0", + "256", + "6", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "layers.5.attn.o_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.5/attn/GroupQueryAttention/output_0.out2_5" + ], + "const_args": [ + "model.layers.5.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.5.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.5.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.5.attn.o_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.5/attn/o_proj/MatMulNBits/output_0.out5_4_16" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_10", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.5/input_layernorm/output_3.out4_9", + "/model/layers.5/attn/o_proj/MatMulNBits/output_0.out5_4_16" + ], + "const_args": [ + "model.layers.5.post_attention_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.5/post_attention_layernorm/output_3.out4_10", + "/model/layers.5/post_attention_layernorm/output_0.out4_10" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "FlatMLP_3_5", + "type": "FlatMLP", + "in_args": [ + "/model/layers.5/post_attention_layernorm/output_0.out4_10" + ], + "const_args": [ + "model.layers.5.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.5.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.5.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.5.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.5.mlp.up_proj.MatMulNBits.qweight", + "model.layers.5.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.5.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.5.mlp.up_proj.MatMulNBits.bias.f" + ], + "out_args": [ + "/model/layers.5/mlp/Mul/output_0.out3_5" + ], + "attrs": { + "input_shape": { + "type": "int", + "value": [ + "1", + "4096", + "14336" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16" + ] + } + } + }, + { + "name": "layers.5.mlp.down_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.5/mlp/Mul/output_0.out3_5" + ], + "const_args": [ + "model.layers.5.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.5.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.5.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.5.mlp.down_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.5/mlp/down_proj/MatMulNBits/output_0.out5_4_17" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "14336" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_11", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.5/post_attention_layernorm/output_3.out4_10", + "/model/layers.5/mlp/down_proj/MatMulNBits/output_0.out5_4_17" + ], + "const_args": [ + "model.layers.6.input_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.6/input_layernorm/output_3.out4_11", + "/model/layers.6/input_layernorm/output_0.out4_11" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "MatMulNBits_2_6", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.6/input_layernorm/output_0.out4_11" + ], + "const_args": [ + "model.layers.6.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.6.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.6.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.6.attn.qk_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.6/attn/qk_proj/MatMulNBits/output_0.out5_4_18" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "5120" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "layers.6.attn.v_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.6/input_layernorm/output_0.out4_11" + ], + "const_args": [ + "model.layers.6.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.6.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.6.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.6.attn.v_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "present.6.value" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "1024" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "total_seq_len": { + "type": "int", + "value": [ + "4096" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "5", + "0", + "27", + "13" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "5", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "/model/layers.6/attn/GroupQueryAttention", + "type": "FLATMHA", + "in_args": [ + "/model/layers.6/attn/qk_proj/MatMulNBits/output_0.out5_4_18", + "past_key_values.6.key", + "past_key_values.6.value", + "attention_mask_const_uint", + "sin_cos_cache_token" + ], + "const_args": [], + "out_args": [ + "/model/layers.6/attn/GroupQueryAttention/output_0.out2_6", + "present.6.key" + ], + "attrs": { + "num_heads": { + "type": "int", + "value": [ + "32" + ] + }, + "kv_num_heads": { + "type": "int", + "value": [ + "8" + ] + }, + "scale": { + "type": "float", + "value": [ + "0.0883883461356163" + ] + }, + "softcap": { + "type": "float", + "value": [ + "0.0" + ] + }, + "do_rotary": { + "type": "int", + "value": [ + "0" + ] + }, + "rotary_interleaved": { + "type": "int", + "value": [ + "0" + ] + }, + "input_shape": { + "type": "int", + "value": [ + "8", + "32", + "1", + "4096", + "128" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "4", + "1", + "0", + "0", + "1", + "0", + "24", + "12", + "2", + "0", + "25", + "13", + "6", + "0", + "26", + "12" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "4", + "0", + "0", + "256", + "6", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "layers.6.attn.o_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.6/attn/GroupQueryAttention/output_0.out2_6" + ], + "const_args": [ + "model.layers.6.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.6.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.6.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.6.attn.o_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.6/attn/o_proj/MatMulNBits/output_0.out5_4_19" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_12", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.6/input_layernorm/output_3.out4_11", + "/model/layers.6/attn/o_proj/MatMulNBits/output_0.out5_4_19" + ], + "const_args": [ + "model.layers.6.post_attention_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.6/post_attention_layernorm/output_3.out4_12", + "/model/layers.6/post_attention_layernorm/output_0.out4_12" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "FlatMLP_3_6", + "type": "FlatMLP", + "in_args": [ + "/model/layers.6/post_attention_layernorm/output_0.out4_12" + ], + "const_args": [ + "model.layers.6.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.6.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.6.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.6.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.6.mlp.up_proj.MatMulNBits.qweight", + "model.layers.6.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.6.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.6.mlp.up_proj.MatMulNBits.bias.f" + ], + "out_args": [ + "/model/layers.6/mlp/Mul/output_0.out3_6" + ], + "attrs": { + "input_shape": { + "type": "int", + "value": [ + "1", + "4096", + "14336" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16" + ] + } + } + }, + { + "name": "layers.6.mlp.down_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.6/mlp/Mul/output_0.out3_6" + ], + "const_args": [ + "model.layers.6.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.6.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.6.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.6.mlp.down_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.6/mlp/down_proj/MatMulNBits/output_0.out5_4_20" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "14336" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_13", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.6/post_attention_layernorm/output_3.out4_12", + "/model/layers.6/mlp/down_proj/MatMulNBits/output_0.out5_4_20" + ], + "const_args": [ + "model.layers.7.input_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.7/input_layernorm/output_3.out4_13", + "/model/layers.7/input_layernorm/output_0.out4_13" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "MatMulNBits_2_7", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.7/input_layernorm/output_0.out4_13" + ], + "const_args": [ + "model.layers.7.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.7.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.7.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.7.attn.qk_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.7/attn/qk_proj/MatMulNBits/output_0.out5_4_21" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "5120" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "layers.7.attn.v_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.7/input_layernorm/output_0.out4_13" + ], + "const_args": [ + "model.layers.7.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.7.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.7.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.7.attn.v_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "present.7.value" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "1024" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "total_seq_len": { + "type": "int", + "value": [ + "4096" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "5", + "0", + "31", + "15" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "5", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "/model/layers.7/attn/GroupQueryAttention", + "type": "FLATMHA", + "in_args": [ + "/model/layers.7/attn/qk_proj/MatMulNBits/output_0.out5_4_21", + "past_key_values.7.key", + "past_key_values.7.value", + "attention_mask_const_uint", + "sin_cos_cache_token" + ], + "const_args": [], + "out_args": [ + "/model/layers.7/attn/GroupQueryAttention/output_0.out2_7", + "present.7.key" + ], + "attrs": { + "num_heads": { + "type": "int", + "value": [ + "32" + ] + }, + "kv_num_heads": { + "type": "int", + "value": [ + "8" + ] + }, + "scale": { + "type": "float", + "value": [ + "0.0883883461356163" + ] + }, + "softcap": { + "type": "float", + "value": [ + "0.0" + ] + }, + "do_rotary": { + "type": "int", + "value": [ + "0" + ] + }, + "rotary_interleaved": { + "type": "int", + "value": [ + "0" + ] + }, + "input_shape": { + "type": "int", + "value": [ + "8", + "32", + "1", + "4096", + "128" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "4", + "1", + "0", + "0", + "1", + "0", + "28", + "14", + "2", + "0", + "29", + "15", + "6", + "0", + "30", + "14" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "4", + "0", + "0", + "256", + "6", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "layers.7.attn.o_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.7/attn/GroupQueryAttention/output_0.out2_7" + ], + "const_args": [ + "model.layers.7.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.7.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.7.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.7.attn.o_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.7/attn/o_proj/MatMulNBits/output_0.out5_4_22" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_14", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.7/input_layernorm/output_3.out4_13", + "/model/layers.7/attn/o_proj/MatMulNBits/output_0.out5_4_22" + ], + "const_args": [ + "model.layers.7.post_attention_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.7/post_attention_layernorm/output_3.out4_14", + "/model/layers.7/post_attention_layernorm/output_0.out4_14" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "FlatMLP_3_7", + "type": "FlatMLP", + "in_args": [ + "/model/layers.7/post_attention_layernorm/output_0.out4_14" + ], + "const_args": [ + "model.layers.7.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.7.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.7.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.7.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.7.mlp.up_proj.MatMulNBits.qweight", + "model.layers.7.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.7.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.7.mlp.up_proj.MatMulNBits.bias.f" + ], + "out_args": [ + "/model/layers.7/mlp/Mul/output_0.out3_7" + ], + "attrs": { + "input_shape": { + "type": "int", + "value": [ + "1", + "4096", + "14336" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16" + ] + } + } + }, + { + "name": "layers.7.mlp.down_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.7/mlp/Mul/output_0.out3_7" + ], + "const_args": [ + "model.layers.7.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.7.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.7.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.7.mlp.down_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.7/mlp/down_proj/MatMulNBits/output_0.out5_4_23" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "14336" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_15", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.7/post_attention_layernorm/output_3.out4_14", + "/model/layers.7/mlp/down_proj/MatMulNBits/output_0.out5_4_23" + ], + "const_args": [ + "model.layers.8.input_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.8/input_layernorm/output_3.out4_15", + "/model/layers.8/input_layernorm/output_0.out4_15" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "MatMulNBits_2_8", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.8/input_layernorm/output_0.out4_15" + ], + "const_args": [ + "model.layers.8.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.8.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.8.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.8.attn.qk_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.8/attn/qk_proj/MatMulNBits/output_0.out5_4_24" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "5120" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "layers.8.attn.v_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.8/input_layernorm/output_0.out4_15" + ], + "const_args": [ + "model.layers.8.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.8.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.8.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.8.attn.v_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "present.8.value" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "1024" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "total_seq_len": { + "type": "int", + "value": [ + "4096" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "5", + "0", + "35", + "17" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "5", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "/model/layers.8/attn/GroupQueryAttention", + "type": "FLATMHA", + "in_args": [ + "/model/layers.8/attn/qk_proj/MatMulNBits/output_0.out5_4_24", + "past_key_values.8.key", + "past_key_values.8.value", + "attention_mask_const_uint", + "sin_cos_cache_token" + ], + "const_args": [], + "out_args": [ + "/model/layers.8/attn/GroupQueryAttention/output_0.out2_8", + "present.8.key" + ], + "attrs": { + "num_heads": { + "type": "int", + "value": [ + "32" + ] + }, + "kv_num_heads": { + "type": "int", + "value": [ + "8" + ] + }, + "scale": { + "type": "float", + "value": [ + "0.0883883461356163" + ] + }, + "softcap": { + "type": "float", + "value": [ + "0.0" + ] + }, + "do_rotary": { + "type": "int", + "value": [ + "0" + ] + }, + "rotary_interleaved": { + "type": "int", + "value": [ + "0" + ] + }, + "input_shape": { + "type": "int", + "value": [ + "8", + "32", + "1", + "4096", + "128" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "4", + "1", + "0", + "0", + "1", + "0", + "32", + "16", + "2", + "0", + "33", + "17", + "6", + "0", + "34", + "16" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "4", + "0", + "0", + "256", + "6", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "layers.8.attn.o_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.8/attn/GroupQueryAttention/output_0.out2_8" + ], + "const_args": [ + "model.layers.8.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.8.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.8.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.8.attn.o_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.8/attn/o_proj/MatMulNBits/output_0.out5_4_25" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_16", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.8/input_layernorm/output_3.out4_15", + "/model/layers.8/attn/o_proj/MatMulNBits/output_0.out5_4_25" + ], + "const_args": [ + "model.layers.8.post_attention_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.8/post_attention_layernorm/output_3.out4_16", + "/model/layers.8/post_attention_layernorm/output_0.out4_16" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "FlatMLP_3_8", + "type": "FlatMLP", + "in_args": [ + "/model/layers.8/post_attention_layernorm/output_0.out4_16" + ], + "const_args": [ + "model.layers.8.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.8.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.8.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.8.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.8.mlp.up_proj.MatMulNBits.qweight", + "model.layers.8.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.8.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.8.mlp.up_proj.MatMulNBits.bias.f" + ], + "out_args": [ + "/model/layers.8/mlp/Mul/output_0.out3_8" + ], + "attrs": { + "input_shape": { + "type": "int", + "value": [ + "1", + "4096", + "14336" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16" + ] + } + } + }, + { + "name": "layers.8.mlp.down_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.8/mlp/Mul/output_0.out3_8" + ], + "const_args": [ + "model.layers.8.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.8.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.8.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.8.mlp.down_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.8/mlp/down_proj/MatMulNBits/output_0.out5_4_26" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "14336" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_17", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.8/post_attention_layernorm/output_3.out4_16", + "/model/layers.8/mlp/down_proj/MatMulNBits/output_0.out5_4_26" + ], + "const_args": [ + "model.layers.9.input_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.9/input_layernorm/output_3.out4_17", + "/model/layers.9/input_layernorm/output_0.out4_17" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "MatMulNBits_2_9", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.9/input_layernorm/output_0.out4_17" + ], + "const_args": [ + "model.layers.9.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.9.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.9.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.9.attn.qk_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.9/attn/qk_proj/MatMulNBits/output_0.out5_4_27" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "5120" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "layers.9.attn.v_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.9/input_layernorm/output_0.out4_17" + ], + "const_args": [ + "model.layers.9.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.9.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.9.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.9.attn.v_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "present.9.value" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "1024" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "total_seq_len": { + "type": "int", + "value": [ + "4096" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "5", + "0", + "39", + "19" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "5", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "/model/layers.9/attn/GroupQueryAttention", + "type": "FLATMHA", + "in_args": [ + "/model/layers.9/attn/qk_proj/MatMulNBits/output_0.out5_4_27", + "past_key_values.9.key", + "past_key_values.9.value", + "attention_mask_const_uint", + "sin_cos_cache_token" + ], + "const_args": [], + "out_args": [ + "/model/layers.9/attn/GroupQueryAttention/output_0.out2_9", + "present.9.key" + ], + "attrs": { + "num_heads": { + "type": "int", + "value": [ + "32" + ] + }, + "kv_num_heads": { + "type": "int", + "value": [ + "8" + ] + }, + "scale": { + "type": "float", + "value": [ + "0.0883883461356163" + ] + }, + "softcap": { + "type": "float", + "value": [ + "0.0" + ] + }, + "do_rotary": { + "type": "int", + "value": [ + "0" + ] + }, + "rotary_interleaved": { + "type": "int", + "value": [ + "0" + ] + }, + "input_shape": { + "type": "int", + "value": [ + "8", + "32", + "1", + "4096", + "128" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "4", + "1", + "0", + "0", + "1", + "0", + "36", + "18", + "2", + "0", + "37", + "19", + "6", + "0", + "38", + "18" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "4", + "0", + "0", + "256", + "6", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "layers.9.attn.o_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.9/attn/GroupQueryAttention/output_0.out2_9" + ], + "const_args": [ + "model.layers.9.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.9.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.9.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.9.attn.o_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.9/attn/o_proj/MatMulNBits/output_0.out5_4_28" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_18", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.9/input_layernorm/output_3.out4_17", + "/model/layers.9/attn/o_proj/MatMulNBits/output_0.out5_4_28" + ], + "const_args": [ + "model.layers.9.post_attention_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.9/post_attention_layernorm/output_3.out4_18", + "/model/layers.9/post_attention_layernorm/output_0.out4_18" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "FlatMLP_3_9", + "type": "FlatMLP", + "in_args": [ + "/model/layers.9/post_attention_layernorm/output_0.out4_18" + ], + "const_args": [ + "model.layers.9.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.9.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.9.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.9.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.9.mlp.up_proj.MatMulNBits.qweight", + "model.layers.9.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.9.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.9.mlp.up_proj.MatMulNBits.bias.f" + ], + "out_args": [ + "/model/layers.9/mlp/Mul/output_0.out3_9" + ], + "attrs": { + "input_shape": { + "type": "int", + "value": [ + "1", + "4096", + "14336" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16" + ] + } + } + }, + { + "name": "layers.9.mlp.down_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.9/mlp/Mul/output_0.out3_9" + ], + "const_args": [ + "model.layers.9.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.9.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.9.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.9.mlp.down_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.9/mlp/down_proj/MatMulNBits/output_0.out5_4_29" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "14336" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_19", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.9/post_attention_layernorm/output_3.out4_18", + "/model/layers.9/mlp/down_proj/MatMulNBits/output_0.out5_4_29" + ], + "const_args": [ + "model.layers.10.input_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.10/input_layernorm/output_3.out4_19", + "/model/layers.10/input_layernorm/output_0.out4_19" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "MatMulNBits_2_10", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.10/input_layernorm/output_0.out4_19" + ], + "const_args": [ + "model.layers.10.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.10.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.10.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.10.attn.qk_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.10/attn/qk_proj/MatMulNBits/output_0.out5_4_30" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "5120" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "layers.10.attn.v_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.10/input_layernorm/output_0.out4_19" + ], + "const_args": [ + "model.layers.10.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.10.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.10.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.10.attn.v_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "present.10.value" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "1024" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "total_seq_len": { + "type": "int", + "value": [ + "4096" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "5", + "0", + "43", + "21" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "5", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "/model/layers.10/attn/GroupQueryAttention", + "type": "FLATMHA", + "in_args": [ + "/model/layers.10/attn/qk_proj/MatMulNBits/output_0.out5_4_30", + "past_key_values.10.key", + "past_key_values.10.value", + "attention_mask_const_uint", + "sin_cos_cache_token" + ], + "const_args": [], + "out_args": [ + "/model/layers.10/attn/GroupQueryAttention/output_0.out2_10", + "present.10.key" + ], + "attrs": { + "num_heads": { + "type": "int", + "value": [ + "32" + ] + }, + "kv_num_heads": { + "type": "int", + "value": [ + "8" + ] + }, + "scale": { + "type": "float", + "value": [ + "0.0883883461356163" + ] + }, + "softcap": { + "type": "float", + "value": [ + "0.0" + ] + }, + "do_rotary": { + "type": "int", + "value": [ + "0" + ] + }, + "rotary_interleaved": { + "type": "int", + "value": [ + "0" + ] + }, + "input_shape": { + "type": "int", + "value": [ + "8", + "32", + "1", + "4096", + "128" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "4", + "1", + "0", + "0", + "1", + "0", + "40", + "20", + "2", + "0", + "41", + "21", + "6", + "0", + "42", + "20" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "4", + "0", + "0", + "256", + "6", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "layers.10.attn.o_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.10/attn/GroupQueryAttention/output_0.out2_10" + ], + "const_args": [ + "model.layers.10.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.10.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.10.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.10.attn.o_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.10/attn/o_proj/MatMulNBits/output_0.out5_4_31" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_20", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.10/input_layernorm/output_3.out4_19", + "/model/layers.10/attn/o_proj/MatMulNBits/output_0.out5_4_31" + ], + "const_args": [ + "model.layers.10.post_attention_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.10/post_attention_layernorm/output_3.out4_20", + "/model/layers.10/post_attention_layernorm/output_0.out4_20" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "FlatMLP_3_10", + "type": "FlatMLP", + "in_args": [ + "/model/layers.10/post_attention_layernorm/output_0.out4_20" + ], + "const_args": [ + "model.layers.10.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.10.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.10.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.10.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.10.mlp.up_proj.MatMulNBits.qweight", + "model.layers.10.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.10.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.10.mlp.up_proj.MatMulNBits.bias.f" + ], + "out_args": [ + "/model/layers.10/mlp/Mul/output_0.out3_10" + ], + "attrs": { + "input_shape": { + "type": "int", + "value": [ + "1", + "4096", + "14336" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16" + ] + } + } + }, + { + "name": "layers.10.mlp.down_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.10/mlp/Mul/output_0.out3_10" + ], + "const_args": [ + "model.layers.10.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.10.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.10.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.10.mlp.down_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.10/mlp/down_proj/MatMulNBits/output_0.out5_4_32" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "14336" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_21", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.10/post_attention_layernorm/output_3.out4_20", + "/model/layers.10/mlp/down_proj/MatMulNBits/output_0.out5_4_32" + ], + "const_args": [ + "model.layers.11.input_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.11/input_layernorm/output_3.out4_21", + "/model/layers.11/input_layernorm/output_0.out4_21" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "MatMulNBits_2_11", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.11/input_layernorm/output_0.out4_21" + ], + "const_args": [ + "model.layers.11.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.11.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.11.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.11.attn.qk_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.11/attn/qk_proj/MatMulNBits/output_0.out5_4_33" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "5120" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "layers.11.attn.v_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.11/input_layernorm/output_0.out4_21" + ], + "const_args": [ + "model.layers.11.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.11.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.11.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.11.attn.v_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "present.11.value" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "1024" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "total_seq_len": { + "type": "int", + "value": [ + "4096" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "5", + "0", + "47", + "23" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "5", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "/model/layers.11/attn/GroupQueryAttention", + "type": "FLATMHA", + "in_args": [ + "/model/layers.11/attn/qk_proj/MatMulNBits/output_0.out5_4_33", + "past_key_values.11.key", + "past_key_values.11.value", + "attention_mask_const_uint", + "sin_cos_cache_token" + ], + "const_args": [], + "out_args": [ + "/model/layers.11/attn/GroupQueryAttention/output_0.out2_11", + "present.11.key" + ], + "attrs": { + "num_heads": { + "type": "int", + "value": [ + "32" + ] + }, + "kv_num_heads": { + "type": "int", + "value": [ + "8" + ] + }, + "scale": { + "type": "float", + "value": [ + "0.0883883461356163" + ] + }, + "softcap": { + "type": "float", + "value": [ + "0.0" + ] + }, + "do_rotary": { + "type": "int", + "value": [ + "0" + ] + }, + "rotary_interleaved": { + "type": "int", + "value": [ + "0" + ] + }, + "input_shape": { + "type": "int", + "value": [ + "8", + "32", + "1", + "4096", + "128" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "4", + "1", + "0", + "0", + "1", + "0", + "44", + "22", + "2", + "0", + "45", + "23", + "6", + "0", + "46", + "22" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "4", + "0", + "0", + "256", + "6", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "layers.11.attn.o_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.11/attn/GroupQueryAttention/output_0.out2_11" + ], + "const_args": [ + "model.layers.11.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.11.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.11.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.11.attn.o_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.11/attn/o_proj/MatMulNBits/output_0.out5_4_34" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_22", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.11/input_layernorm/output_3.out4_21", + "/model/layers.11/attn/o_proj/MatMulNBits/output_0.out5_4_34" + ], + "const_args": [ + "model.layers.11.post_attention_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.11/post_attention_layernorm/output_3.out4_22", + "/model/layers.11/post_attention_layernorm/output_0.out4_22" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "FlatMLP_3_11", + "type": "FlatMLP", + "in_args": [ + "/model/layers.11/post_attention_layernorm/output_0.out4_22" + ], + "const_args": [ + "model.layers.11.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.11.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.11.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.11.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.11.mlp.up_proj.MatMulNBits.qweight", + "model.layers.11.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.11.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.11.mlp.up_proj.MatMulNBits.bias.f" + ], + "out_args": [ + "/model/layers.11/mlp/Mul/output_0.out3_11" + ], + "attrs": { + "input_shape": { + "type": "int", + "value": [ + "1", + "4096", + "14336" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16" + ] + } + } + }, + { + "name": "layers.11.mlp.down_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.11/mlp/Mul/output_0.out3_11" + ], + "const_args": [ + "model.layers.11.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.11.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.11.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.11.mlp.down_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.11/mlp/down_proj/MatMulNBits/output_0.out5_4_35" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "14336" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_23", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.11/post_attention_layernorm/output_3.out4_22", + "/model/layers.11/mlp/down_proj/MatMulNBits/output_0.out5_4_35" + ], + "const_args": [ + "model.layers.12.input_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.12/input_layernorm/output_3.out4_23", + "/model/layers.12/input_layernorm/output_0.out4_23" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "MatMulNBits_2_12", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.12/input_layernorm/output_0.out4_23" + ], + "const_args": [ + "model.layers.12.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.12.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.12.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.12.attn.qk_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.12/attn/qk_proj/MatMulNBits/output_0.out5_4_36" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "5120" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "layers.12.attn.v_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.12/input_layernorm/output_0.out4_23" + ], + "const_args": [ + "model.layers.12.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.12.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.12.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.12.attn.v_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "present.12.value" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "1024" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "total_seq_len": { + "type": "int", + "value": [ + "4096" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "5", + "0", + "51", + "25" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "5", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "/model/layers.12/attn/GroupQueryAttention", + "type": "FLATMHA", + "in_args": [ + "/model/layers.12/attn/qk_proj/MatMulNBits/output_0.out5_4_36", + "past_key_values.12.key", + "past_key_values.12.value", + "attention_mask_const_uint", + "sin_cos_cache_token" + ], + "const_args": [], + "out_args": [ + "/model/layers.12/attn/GroupQueryAttention/output_0.out2_12", + "present.12.key" + ], + "attrs": { + "num_heads": { + "type": "int", + "value": [ + "32" + ] + }, + "kv_num_heads": { + "type": "int", + "value": [ + "8" + ] + }, + "scale": { + "type": "float", + "value": [ + "0.0883883461356163" + ] + }, + "softcap": { + "type": "float", + "value": [ + "0.0" + ] + }, + "do_rotary": { + "type": "int", + "value": [ + "0" + ] + }, + "rotary_interleaved": { + "type": "int", + "value": [ + "0" + ] + }, + "input_shape": { + "type": "int", + "value": [ + "8", + "32", + "1", + "4096", + "128" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "4", + "1", + "0", + "0", + "1", + "0", + "48", + "24", + "2", + "0", + "49", + "25", + "6", + "0", + "50", + "24" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "4", + "0", + "0", + "256", + "6", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "layers.12.attn.o_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.12/attn/GroupQueryAttention/output_0.out2_12" + ], + "const_args": [ + "model.layers.12.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.12.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.12.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.12.attn.o_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.12/attn/o_proj/MatMulNBits/output_0.out5_4_37" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_24", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.12/input_layernorm/output_3.out4_23", + "/model/layers.12/attn/o_proj/MatMulNBits/output_0.out5_4_37" + ], + "const_args": [ + "model.layers.12.post_attention_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.12/post_attention_layernorm/output_3.out4_24", + "/model/layers.12/post_attention_layernorm/output_0.out4_24" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "FlatMLP_3_12", + "type": "FlatMLP", + "in_args": [ + "/model/layers.12/post_attention_layernorm/output_0.out4_24" + ], + "const_args": [ + "model.layers.12.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.12.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.12.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.12.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.12.mlp.up_proj.MatMulNBits.qweight", + "model.layers.12.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.12.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.12.mlp.up_proj.MatMulNBits.bias.f" + ], + "out_args": [ + "/model/layers.12/mlp/Mul/output_0.out3_12" + ], + "attrs": { + "input_shape": { + "type": "int", + "value": [ + "1", + "4096", + "14336" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16" + ] + } + } + }, + { + "name": "layers.12.mlp.down_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.12/mlp/Mul/output_0.out3_12" + ], + "const_args": [ + "model.layers.12.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.12.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.12.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.12.mlp.down_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.12/mlp/down_proj/MatMulNBits/output_0.out5_4_38" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "14336" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_25", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.12/post_attention_layernorm/output_3.out4_24", + "/model/layers.12/mlp/down_proj/MatMulNBits/output_0.out5_4_38" + ], + "const_args": [ + "model.layers.13.input_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.13/input_layernorm/output_3.out4_25", + "/model/layers.13/input_layernorm/output_0.out4_25" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "MatMulNBits_2_13", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.13/input_layernorm/output_0.out4_25" + ], + "const_args": [ + "model.layers.13.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.13.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.13.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.13.attn.qk_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.13/attn/qk_proj/MatMulNBits/output_0.out5_4_39" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "5120" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "layers.13.attn.v_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.13/input_layernorm/output_0.out4_25" + ], + "const_args": [ + "model.layers.13.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.13.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.13.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.13.attn.v_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "present.13.value" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "1024" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "total_seq_len": { + "type": "int", + "value": [ + "4096" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "5", + "0", + "55", + "27" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "5", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "/model/layers.13/attn/GroupQueryAttention", + "type": "FLATMHA", + "in_args": [ + "/model/layers.13/attn/qk_proj/MatMulNBits/output_0.out5_4_39", + "past_key_values.13.key", + "past_key_values.13.value", + "attention_mask_const_uint", + "sin_cos_cache_token" + ], + "const_args": [], + "out_args": [ + "/model/layers.13/attn/GroupQueryAttention/output_0.out2_13", + "present.13.key" + ], + "attrs": { + "num_heads": { + "type": "int", + "value": [ + "32" + ] + }, + "kv_num_heads": { + "type": "int", + "value": [ + "8" + ] + }, + "scale": { + "type": "float", + "value": [ + "0.0883883461356163" + ] + }, + "softcap": { + "type": "float", + "value": [ + "0.0" + ] + }, + "do_rotary": { + "type": "int", + "value": [ + "0" + ] + }, + "rotary_interleaved": { + "type": "int", + "value": [ + "0" + ] + }, + "input_shape": { + "type": "int", + "value": [ + "8", + "32", + "1", + "4096", + "128" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "4", + "1", + "0", + "0", + "1", + "0", + "52", + "26", + "2", + "0", + "53", + "27", + "6", + "0", + "54", + "26" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "4", + "0", + "0", + "256", + "6", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "layers.13.attn.o_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.13/attn/GroupQueryAttention/output_0.out2_13" + ], + "const_args": [ + "model.layers.13.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.13.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.13.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.13.attn.o_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.13/attn/o_proj/MatMulNBits/output_0.out5_4_40" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_26", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.13/input_layernorm/output_3.out4_25", + "/model/layers.13/attn/o_proj/MatMulNBits/output_0.out5_4_40" + ], + "const_args": [ + "model.layers.13.post_attention_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.13/post_attention_layernorm/output_3.out4_26", + "/model/layers.13/post_attention_layernorm/output_0.out4_26" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "FlatMLP_3_13", + "type": "FlatMLP", + "in_args": [ + "/model/layers.13/post_attention_layernorm/output_0.out4_26" + ], + "const_args": [ + "model.layers.13.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.13.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.13.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.13.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.13.mlp.up_proj.MatMulNBits.qweight", + "model.layers.13.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.13.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.13.mlp.up_proj.MatMulNBits.bias.f" + ], + "out_args": [ + "/model/layers.13/mlp/Mul/output_0.out3_13" + ], + "attrs": { + "input_shape": { + "type": "int", + "value": [ + "1", + "4096", + "14336" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16" + ] + } + } + }, + { + "name": "layers.13.mlp.down_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.13/mlp/Mul/output_0.out3_13" + ], + "const_args": [ + "model.layers.13.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.13.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.13.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.13.mlp.down_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.13/mlp/down_proj/MatMulNBits/output_0.out5_4_41" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "14336" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_27", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.13/post_attention_layernorm/output_3.out4_26", + "/model/layers.13/mlp/down_proj/MatMulNBits/output_0.out5_4_41" + ], + "const_args": [ + "model.layers.14.input_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.14/input_layernorm/output_3.out4_27", + "/model/layers.14/input_layernorm/output_0.out4_27" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "MatMulNBits_2_14", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.14/input_layernorm/output_0.out4_27" + ], + "const_args": [ + "model.layers.14.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.14.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.14.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.14.attn.qk_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.14/attn/qk_proj/MatMulNBits/output_0.out5_4_42" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "5120" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "layers.14.attn.v_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.14/input_layernorm/output_0.out4_27" + ], + "const_args": [ + "model.layers.14.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.14.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.14.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.14.attn.v_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "present.14.value" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "1024" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "total_seq_len": { + "type": "int", + "value": [ + "4096" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "5", + "0", + "59", + "29" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "5", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "/model/layers.14/attn/GroupQueryAttention", + "type": "FLATMHA", + "in_args": [ + "/model/layers.14/attn/qk_proj/MatMulNBits/output_0.out5_4_42", + "past_key_values.14.key", + "past_key_values.14.value", + "attention_mask_const_uint", + "sin_cos_cache_token" + ], + "const_args": [], + "out_args": [ + "/model/layers.14/attn/GroupQueryAttention/output_0.out2_14", + "present.14.key" + ], + "attrs": { + "num_heads": { + "type": "int", + "value": [ + "32" + ] + }, + "kv_num_heads": { + "type": "int", + "value": [ + "8" + ] + }, + "scale": { + "type": "float", + "value": [ + "0.0883883461356163" + ] + }, + "softcap": { + "type": "float", + "value": [ + "0.0" + ] + }, + "do_rotary": { + "type": "int", + "value": [ + "0" + ] + }, + "rotary_interleaved": { + "type": "int", + "value": [ + "0" + ] + }, + "input_shape": { + "type": "int", + "value": [ + "8", + "32", + "1", + "4096", + "128" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "4", + "1", + "0", + "0", + "1", + "0", + "56", + "28", + "2", + "0", + "57", + "29", + "6", + "0", + "58", + "28" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "4", + "0", + "0", + "256", + "6", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "layers.14.attn.o_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.14/attn/GroupQueryAttention/output_0.out2_14" + ], + "const_args": [ + "model.layers.14.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.14.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.14.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.14.attn.o_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.14/attn/o_proj/MatMulNBits/output_0.out5_4_43" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_28", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.14/input_layernorm/output_3.out4_27", + "/model/layers.14/attn/o_proj/MatMulNBits/output_0.out5_4_43" + ], + "const_args": [ + "model.layers.14.post_attention_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.14/post_attention_layernorm/output_3.out4_28", + "/model/layers.14/post_attention_layernorm/output_0.out4_28" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "FlatMLP_3_14", + "type": "FlatMLP", + "in_args": [ + "/model/layers.14/post_attention_layernorm/output_0.out4_28" + ], + "const_args": [ + "model.layers.14.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.14.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.14.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.14.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.14.mlp.up_proj.MatMulNBits.qweight", + "model.layers.14.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.14.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.14.mlp.up_proj.MatMulNBits.bias.f" + ], + "out_args": [ + "/model/layers.14/mlp/Mul/output_0.out3_14" + ], + "attrs": { + "input_shape": { + "type": "int", + "value": [ + "1", + "4096", + "14336" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16" + ] + } + } + }, + { + "name": "layers.14.mlp.down_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.14/mlp/Mul/output_0.out3_14" + ], + "const_args": [ + "model.layers.14.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.14.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.14.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.14.mlp.down_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.14/mlp/down_proj/MatMulNBits/output_0.out5_4_44" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "14336" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_29", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.14/post_attention_layernorm/output_3.out4_28", + "/model/layers.14/mlp/down_proj/MatMulNBits/output_0.out5_4_44" + ], + "const_args": [ + "model.layers.15.input_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.15/input_layernorm/output_3.out4_29", + "/model/layers.15/input_layernorm/output_0.out4_29" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "MatMulNBits_2_15", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.15/input_layernorm/output_0.out4_29" + ], + "const_args": [ + "model.layers.15.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.15.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.15.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.15.attn.qk_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.15/attn/qk_proj/MatMulNBits/output_0.out5_4_45" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "5120" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "layers.15.attn.v_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.15/input_layernorm/output_0.out4_29" + ], + "const_args": [ + "model.layers.15.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.15.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.15.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.15.attn.v_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "present.15.value" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "1024" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "total_seq_len": { + "type": "int", + "value": [ + "4096" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "5", + "0", + "63", + "31" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "5", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "/model/layers.15/attn/GroupQueryAttention", + "type": "FLATMHA", + "in_args": [ + "/model/layers.15/attn/qk_proj/MatMulNBits/output_0.out5_4_45", + "past_key_values.15.key", + "past_key_values.15.value", + "attention_mask_const_uint", + "sin_cos_cache_token" + ], + "const_args": [], + "out_args": [ + "/model/layers.15/attn/GroupQueryAttention/output_0.out2_15", + "present.15.key" + ], + "attrs": { + "num_heads": { + "type": "int", + "value": [ + "32" + ] + }, + "kv_num_heads": { + "type": "int", + "value": [ + "8" + ] + }, + "scale": { + "type": "float", + "value": [ + "0.0883883461356163" + ] + }, + "softcap": { + "type": "float", + "value": [ + "0.0" + ] + }, + "do_rotary": { + "type": "int", + "value": [ + "0" + ] + }, + "rotary_interleaved": { + "type": "int", + "value": [ + "0" + ] + }, + "input_shape": { + "type": "int", + "value": [ + "8", + "32", + "1", + "4096", + "128" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "4", + "1", + "0", + "0", + "1", + "0", + "60", + "30", + "2", + "0", + "61", + "31", + "6", + "0", + "62", + "30" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "4", + "0", + "0", + "256", + "6", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "layers.15.attn.o_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.15/attn/GroupQueryAttention/output_0.out2_15" + ], + "const_args": [ + "model.layers.15.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.15.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.15.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.15.attn.o_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.15/attn/o_proj/MatMulNBits/output_0.out5_4_46" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_30", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.15/input_layernorm/output_3.out4_29", + "/model/layers.15/attn/o_proj/MatMulNBits/output_0.out5_4_46" + ], + "const_args": [ + "model.layers.15.post_attention_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.15/post_attention_layernorm/output_3.out4_30", + "/model/layers.15/post_attention_layernorm/output_0.out4_30" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "FlatMLP_3_15", + "type": "FlatMLP", + "in_args": [ + "/model/layers.15/post_attention_layernorm/output_0.out4_30" + ], + "const_args": [ + "model.layers.15.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.15.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.15.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.15.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.15.mlp.up_proj.MatMulNBits.qweight", + "model.layers.15.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.15.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.15.mlp.up_proj.MatMulNBits.bias.f" + ], + "out_args": [ + "/model/layers.15/mlp/Mul/output_0.out3_15" + ], + "attrs": { + "input_shape": { + "type": "int", + "value": [ + "1", + "4096", + "14336" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16" + ] + } + } + }, + { + "name": "layers.15.mlp.down_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.15/mlp/Mul/output_0.out3_15" + ], + "const_args": [ + "model.layers.15.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.15.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.15.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.15.mlp.down_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.15/mlp/down_proj/MatMulNBits/output_0.out5_4_47" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "14336" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_31", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.15/post_attention_layernorm/output_3.out4_30", + "/model/layers.15/mlp/down_proj/MatMulNBits/output_0.out5_4_47" + ], + "const_args": [ + "model.layers.16.input_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.16/input_layernorm/output_3.out4_31", + "/model/layers.16/input_layernorm/output_0.out4_31" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "MatMulNBits_2_16", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.16/input_layernorm/output_0.out4_31" + ], + "const_args": [ + "model.layers.16.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.16.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.16.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.16.attn.qk_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.16/attn/qk_proj/MatMulNBits/output_0.out5_4_48" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "5120" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "layers.16.attn.v_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.16/input_layernorm/output_0.out4_31" + ], + "const_args": [ + "model.layers.16.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.16.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.16.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.16.attn.v_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "present.16.value" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "1024" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "total_seq_len": { + "type": "int", + "value": [ + "4096" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "5", + "0", + "67", + "33" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "5", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "/model/layers.16/attn/GroupQueryAttention", + "type": "FLATMHA", + "in_args": [ + "/model/layers.16/attn/qk_proj/MatMulNBits/output_0.out5_4_48", + "past_key_values.16.key", + "past_key_values.16.value", + "attention_mask_const_uint", + "sin_cos_cache_token" + ], + "const_args": [], + "out_args": [ + "/model/layers.16/attn/GroupQueryAttention/output_0.out2_16", + "present.16.key" + ], + "attrs": { + "num_heads": { + "type": "int", + "value": [ + "32" + ] + }, + "kv_num_heads": { + "type": "int", + "value": [ + "8" + ] + }, + "scale": { + "type": "float", + "value": [ + "0.0883883461356163" + ] + }, + "softcap": { + "type": "float", + "value": [ + "0.0" + ] + }, + "do_rotary": { + "type": "int", + "value": [ + "0" + ] + }, + "rotary_interleaved": { + "type": "int", + "value": [ + "0" + ] + }, + "input_shape": { + "type": "int", + "value": [ + "8", + "32", + "1", + "4096", + "128" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "4", + "1", + "0", + "0", + "1", + "0", + "64", + "32", + "2", + "0", + "65", + "33", + "6", + "0", + "66", + "32" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "4", + "0", + "0", + "256", + "6", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "layers.16.attn.o_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.16/attn/GroupQueryAttention/output_0.out2_16" + ], + "const_args": [ + "model.layers.16.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.16.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.16.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.16.attn.o_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.16/attn/o_proj/MatMulNBits/output_0.out5_4_49" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_32", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.16/input_layernorm/output_3.out4_31", + "/model/layers.16/attn/o_proj/MatMulNBits/output_0.out5_4_49" + ], + "const_args": [ + "model.layers.16.post_attention_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.16/post_attention_layernorm/output_3.out4_32", + "/model/layers.16/post_attention_layernorm/output_0.out4_32" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "FlatMLP_3_16", + "type": "FlatMLP", + "in_args": [ + "/model/layers.16/post_attention_layernorm/output_0.out4_32" + ], + "const_args": [ + "model.layers.16.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.16.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.16.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.16.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.16.mlp.up_proj.MatMulNBits.qweight", + "model.layers.16.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.16.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.16.mlp.up_proj.MatMulNBits.bias.f" + ], + "out_args": [ + "/model/layers.16/mlp/Mul/output_0.out3_16" + ], + "attrs": { + "input_shape": { + "type": "int", + "value": [ + "1", + "4096", + "14336" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16" + ] + } + } + }, + { + "name": "layers.16.mlp.down_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.16/mlp/Mul/output_0.out3_16" + ], + "const_args": [ + "model.layers.16.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.16.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.16.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.16.mlp.down_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.16/mlp/down_proj/MatMulNBits/output_0.out5_4_50" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "14336" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_33", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.16/post_attention_layernorm/output_3.out4_32", + "/model/layers.16/mlp/down_proj/MatMulNBits/output_0.out5_4_50" + ], + "const_args": [ + "model.layers.17.input_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.17/input_layernorm/output_3.out4_33", + "/model/layers.17/input_layernorm/output_0.out4_33" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "MatMulNBits_2_17", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.17/input_layernorm/output_0.out4_33" + ], + "const_args": [ + "model.layers.17.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.17.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.17.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.17.attn.qk_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.17/attn/qk_proj/MatMulNBits/output_0.out5_4_51" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "5120" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "layers.17.attn.v_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.17/input_layernorm/output_0.out4_33" + ], + "const_args": [ + "model.layers.17.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.17.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.17.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.17.attn.v_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "present.17.value" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "1024" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "total_seq_len": { + "type": "int", + "value": [ + "4096" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "5", + "0", + "71", + "35" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "5", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "/model/layers.17/attn/GroupQueryAttention", + "type": "FLATMHA", + "in_args": [ + "/model/layers.17/attn/qk_proj/MatMulNBits/output_0.out5_4_51", + "past_key_values.17.key", + "past_key_values.17.value", + "attention_mask_const_uint", + "sin_cos_cache_token" + ], + "const_args": [], + "out_args": [ + "/model/layers.17/attn/GroupQueryAttention/output_0.out2_17", + "present.17.key" + ], + "attrs": { + "num_heads": { + "type": "int", + "value": [ + "32" + ] + }, + "kv_num_heads": { + "type": "int", + "value": [ + "8" + ] + }, + "scale": { + "type": "float", + "value": [ + "0.0883883461356163" + ] + }, + "softcap": { + "type": "float", + "value": [ + "0.0" + ] + }, + "do_rotary": { + "type": "int", + "value": [ + "0" + ] + }, + "rotary_interleaved": { + "type": "int", + "value": [ + "0" + ] + }, + "input_shape": { + "type": "int", + "value": [ + "8", + "32", + "1", + "4096", + "128" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "4", + "1", + "0", + "0", + "1", + "0", + "68", + "34", + "2", + "0", + "69", + "35", + "6", + "0", + "70", + "34" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "4", + "0", + "0", + "256", + "6", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "layers.17.attn.o_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.17/attn/GroupQueryAttention/output_0.out2_17" + ], + "const_args": [ + "model.layers.17.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.17.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.17.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.17.attn.o_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.17/attn/o_proj/MatMulNBits/output_0.out5_4_52" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_34", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.17/input_layernorm/output_3.out4_33", + "/model/layers.17/attn/o_proj/MatMulNBits/output_0.out5_4_52" + ], + "const_args": [ + "model.layers.17.post_attention_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.17/post_attention_layernorm/output_3.out4_34", + "/model/layers.17/post_attention_layernorm/output_0.out4_34" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "FlatMLP_3_17", + "type": "FlatMLP", + "in_args": [ + "/model/layers.17/post_attention_layernorm/output_0.out4_34" + ], + "const_args": [ + "model.layers.17.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.17.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.17.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.17.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.17.mlp.up_proj.MatMulNBits.qweight", + "model.layers.17.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.17.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.17.mlp.up_proj.MatMulNBits.bias.f" + ], + "out_args": [ + "/model/layers.17/mlp/Mul/output_0.out3_17" + ], + "attrs": { + "input_shape": { + "type": "int", + "value": [ + "1", + "4096", + "14336" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16" + ] + } + } + }, + { + "name": "layers.17.mlp.down_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.17/mlp/Mul/output_0.out3_17" + ], + "const_args": [ + "model.layers.17.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.17.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.17.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.17.mlp.down_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.17/mlp/down_proj/MatMulNBits/output_0.out5_4_53" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "14336" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_35", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.17/post_attention_layernorm/output_3.out4_34", + "/model/layers.17/mlp/down_proj/MatMulNBits/output_0.out5_4_53" + ], + "const_args": [ + "model.layers.18.input_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.18/input_layernorm/output_3.out4_35", + "/model/layers.18/input_layernorm/output_0.out4_35" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "MatMulNBits_2_18", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.18/input_layernorm/output_0.out4_35" + ], + "const_args": [ + "model.layers.18.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.18.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.18.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.18.attn.qk_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.18/attn/qk_proj/MatMulNBits/output_0.out5_4_54" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "5120" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "layers.18.attn.v_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.18/input_layernorm/output_0.out4_35" + ], + "const_args": [ + "model.layers.18.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.18.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.18.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.18.attn.v_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "present.18.value" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "1024" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "total_seq_len": { + "type": "int", + "value": [ + "4096" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "5", + "0", + "75", + "37" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "5", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "/model/layers.18/attn/GroupQueryAttention", + "type": "FLATMHA", + "in_args": [ + "/model/layers.18/attn/qk_proj/MatMulNBits/output_0.out5_4_54", + "past_key_values.18.key", + "past_key_values.18.value", + "attention_mask_const_uint", + "sin_cos_cache_token" + ], + "const_args": [], + "out_args": [ + "/model/layers.18/attn/GroupQueryAttention/output_0.out2_18", + "present.18.key" + ], + "attrs": { + "num_heads": { + "type": "int", + "value": [ + "32" + ] + }, + "kv_num_heads": { + "type": "int", + "value": [ + "8" + ] + }, + "scale": { + "type": "float", + "value": [ + "0.0883883461356163" + ] + }, + "softcap": { + "type": "float", + "value": [ + "0.0" + ] + }, + "do_rotary": { + "type": "int", + "value": [ + "0" + ] + }, + "rotary_interleaved": { + "type": "int", + "value": [ + "0" + ] + }, + "input_shape": { + "type": "int", + "value": [ + "8", + "32", + "1", + "4096", + "128" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "4", + "1", + "0", + "0", + "1", + "0", + "72", + "36", + "2", + "0", + "73", + "37", + "6", + "0", + "74", + "36" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "4", + "0", + "0", + "256", + "6", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "layers.18.attn.o_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.18/attn/GroupQueryAttention/output_0.out2_18" + ], + "const_args": [ + "model.layers.18.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.18.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.18.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.18.attn.o_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.18/attn/o_proj/MatMulNBits/output_0.out5_4_55" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_36", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.18/input_layernorm/output_3.out4_35", + "/model/layers.18/attn/o_proj/MatMulNBits/output_0.out5_4_55" + ], + "const_args": [ + "model.layers.18.post_attention_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.18/post_attention_layernorm/output_3.out4_36", + "/model/layers.18/post_attention_layernorm/output_0.out4_36" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "FlatMLP_3_18", + "type": "FlatMLP", + "in_args": [ + "/model/layers.18/post_attention_layernorm/output_0.out4_36" + ], + "const_args": [ + "model.layers.18.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.18.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.18.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.18.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.18.mlp.up_proj.MatMulNBits.qweight", + "model.layers.18.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.18.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.18.mlp.up_proj.MatMulNBits.bias.f" + ], + "out_args": [ + "/model/layers.18/mlp/Mul/output_0.out3_18" + ], + "attrs": { + "input_shape": { + "type": "int", + "value": [ + "1", + "4096", + "14336" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16" + ] + } + } + }, + { + "name": "layers.18.mlp.down_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.18/mlp/Mul/output_0.out3_18" + ], + "const_args": [ + "model.layers.18.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.18.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.18.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.18.mlp.down_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.18/mlp/down_proj/MatMulNBits/output_0.out5_4_56" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "14336" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_37", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.18/post_attention_layernorm/output_3.out4_36", + "/model/layers.18/mlp/down_proj/MatMulNBits/output_0.out5_4_56" + ], + "const_args": [ + "model.layers.19.input_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.19/input_layernorm/output_3.out4_37", + "/model/layers.19/input_layernorm/output_0.out4_37" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "MatMulNBits_2_19", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.19/input_layernorm/output_0.out4_37" + ], + "const_args": [ + "model.layers.19.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.19.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.19.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.19.attn.qk_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.19/attn/qk_proj/MatMulNBits/output_0.out5_4_57" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "5120" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "layers.19.attn.v_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.19/input_layernorm/output_0.out4_37" + ], + "const_args": [ + "model.layers.19.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.19.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.19.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.19.attn.v_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "present.19.value" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "1024" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "total_seq_len": { + "type": "int", + "value": [ + "4096" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "5", + "0", + "79", + "39" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "5", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "/model/layers.19/attn/GroupQueryAttention", + "type": "FLATMHA", + "in_args": [ + "/model/layers.19/attn/qk_proj/MatMulNBits/output_0.out5_4_57", + "past_key_values.19.key", + "past_key_values.19.value", + "attention_mask_const_uint", + "sin_cos_cache_token" + ], + "const_args": [], + "out_args": [ + "/model/layers.19/attn/GroupQueryAttention/output_0.out2_19", + "present.19.key" + ], + "attrs": { + "num_heads": { + "type": "int", + "value": [ + "32" + ] + }, + "kv_num_heads": { + "type": "int", + "value": [ + "8" + ] + }, + "scale": { + "type": "float", + "value": [ + "0.0883883461356163" + ] + }, + "softcap": { + "type": "float", + "value": [ + "0.0" + ] + }, + "do_rotary": { + "type": "int", + "value": [ + "0" + ] + }, + "rotary_interleaved": { + "type": "int", + "value": [ + "0" + ] + }, + "input_shape": { + "type": "int", + "value": [ + "8", + "32", + "1", + "4096", + "128" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "4", + "1", + "0", + "0", + "1", + "0", + "76", + "38", + "2", + "0", + "77", + "39", + "6", + "0", + "78", + "38" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "4", + "0", + "0", + "256", + "6", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "layers.19.attn.o_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.19/attn/GroupQueryAttention/output_0.out2_19" + ], + "const_args": [ + "model.layers.19.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.19.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.19.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.19.attn.o_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.19/attn/o_proj/MatMulNBits/output_0.out5_4_58" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_38", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.19/input_layernorm/output_3.out4_37", + "/model/layers.19/attn/o_proj/MatMulNBits/output_0.out5_4_58" + ], + "const_args": [ + "model.layers.19.post_attention_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.19/post_attention_layernorm/output_3.out4_38", + "/model/layers.19/post_attention_layernorm/output_0.out4_38" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "FlatMLP_3_19", + "type": "FlatMLP", + "in_args": [ + "/model/layers.19/post_attention_layernorm/output_0.out4_38" + ], + "const_args": [ + "model.layers.19.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.19.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.19.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.19.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.19.mlp.up_proj.MatMulNBits.qweight", + "model.layers.19.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.19.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.19.mlp.up_proj.MatMulNBits.bias.f" + ], + "out_args": [ + "/model/layers.19/mlp/Mul/output_0.out3_19" + ], + "attrs": { + "input_shape": { + "type": "int", + "value": [ + "1", + "4096", + "14336" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16" + ] + } + } + }, + { + "name": "layers.19.mlp.down_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.19/mlp/Mul/output_0.out3_19" + ], + "const_args": [ + "model.layers.19.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.19.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.19.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.19.mlp.down_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.19/mlp/down_proj/MatMulNBits/output_0.out5_4_59" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "14336" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_39", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.19/post_attention_layernorm/output_3.out4_38", + "/model/layers.19/mlp/down_proj/MatMulNBits/output_0.out5_4_59" + ], + "const_args": [ + "model.layers.20.input_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.20/input_layernorm/output_3.out4_39", + "/model/layers.20/input_layernorm/output_0.out4_39" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "MatMulNBits_2_20", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.20/input_layernorm/output_0.out4_39" + ], + "const_args": [ + "model.layers.20.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.20.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.20.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.20.attn.qk_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.20/attn/qk_proj/MatMulNBits/output_0.out5_4_60" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "5120" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "layers.20.attn.v_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.20/input_layernorm/output_0.out4_39" + ], + "const_args": [ + "model.layers.20.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.20.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.20.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.20.attn.v_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "present.20.value" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "1024" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "total_seq_len": { + "type": "int", + "value": [ + "4096" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "5", + "0", + "83", + "41" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "5", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "/model/layers.20/attn/GroupQueryAttention", + "type": "FLATMHA", + "in_args": [ + "/model/layers.20/attn/qk_proj/MatMulNBits/output_0.out5_4_60", + "past_key_values.20.key", + "past_key_values.20.value", + "attention_mask_const_uint", + "sin_cos_cache_token" + ], + "const_args": [], + "out_args": [ + "/model/layers.20/attn/GroupQueryAttention/output_0.out2_20", + "present.20.key" + ], + "attrs": { + "num_heads": { + "type": "int", + "value": [ + "32" + ] + }, + "kv_num_heads": { + "type": "int", + "value": [ + "8" + ] + }, + "scale": { + "type": "float", + "value": [ + "0.0883883461356163" + ] + }, + "softcap": { + "type": "float", + "value": [ + "0.0" + ] + }, + "do_rotary": { + "type": "int", + "value": [ + "0" + ] + }, + "rotary_interleaved": { + "type": "int", + "value": [ + "0" + ] + }, + "input_shape": { + "type": "int", + "value": [ + "8", + "32", + "1", + "4096", + "128" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "4", + "1", + "0", + "0", + "1", + "0", + "80", + "40", + "2", + "0", + "81", + "41", + "6", + "0", + "82", + "40" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "4", + "0", + "0", + "256", + "6", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "layers.20.attn.o_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.20/attn/GroupQueryAttention/output_0.out2_20" + ], + "const_args": [ + "model.layers.20.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.20.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.20.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.20.attn.o_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.20/attn/o_proj/MatMulNBits/output_0.out5_4_61" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_40", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.20/input_layernorm/output_3.out4_39", + "/model/layers.20/attn/o_proj/MatMulNBits/output_0.out5_4_61" + ], + "const_args": [ + "model.layers.20.post_attention_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.20/post_attention_layernorm/output_3.out4_40", + "/model/layers.20/post_attention_layernorm/output_0.out4_40" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "FlatMLP_3_20", + "type": "FlatMLP", + "in_args": [ + "/model/layers.20/post_attention_layernorm/output_0.out4_40" + ], + "const_args": [ + "model.layers.20.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.20.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.20.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.20.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.20.mlp.up_proj.MatMulNBits.qweight", + "model.layers.20.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.20.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.20.mlp.up_proj.MatMulNBits.bias.f" + ], + "out_args": [ + "/model/layers.20/mlp/Mul/output_0.out3_20" + ], + "attrs": { + "input_shape": { + "type": "int", + "value": [ + "1", + "4096", + "14336" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16" + ] + } + } + }, + { + "name": "layers.20.mlp.down_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.20/mlp/Mul/output_0.out3_20" + ], + "const_args": [ + "model.layers.20.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.20.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.20.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.20.mlp.down_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.20/mlp/down_proj/MatMulNBits/output_0.out5_4_62" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "14336" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_41", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.20/post_attention_layernorm/output_3.out4_40", + "/model/layers.20/mlp/down_proj/MatMulNBits/output_0.out5_4_62" + ], + "const_args": [ + "model.layers.21.input_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.21/input_layernorm/output_3.out4_41", + "/model/layers.21/input_layernorm/output_0.out4_41" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "MatMulNBits_2_21", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.21/input_layernorm/output_0.out4_41" + ], + "const_args": [ + "model.layers.21.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.21.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.21.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.21.attn.qk_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.21/attn/qk_proj/MatMulNBits/output_0.out5_4_63" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "5120" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "layers.21.attn.v_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.21/input_layernorm/output_0.out4_41" + ], + "const_args": [ + "model.layers.21.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.21.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.21.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.21.attn.v_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "present.21.value" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "1024" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "total_seq_len": { + "type": "int", + "value": [ + "4096" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "5", + "0", + "87", + "43" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "5", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "/model/layers.21/attn/GroupQueryAttention", + "type": "FLATMHA", + "in_args": [ + "/model/layers.21/attn/qk_proj/MatMulNBits/output_0.out5_4_63", + "past_key_values.21.key", + "past_key_values.21.value", + "attention_mask_const_uint", + "sin_cos_cache_token" + ], + "const_args": [], + "out_args": [ + "/model/layers.21/attn/GroupQueryAttention/output_0.out2_21", + "present.21.key" + ], + "attrs": { + "num_heads": { + "type": "int", + "value": [ + "32" + ] + }, + "kv_num_heads": { + "type": "int", + "value": [ + "8" + ] + }, + "scale": { + "type": "float", + "value": [ + "0.0883883461356163" + ] + }, + "softcap": { + "type": "float", + "value": [ + "0.0" + ] + }, + "do_rotary": { + "type": "int", + "value": [ + "0" + ] + }, + "rotary_interleaved": { + "type": "int", + "value": [ + "0" + ] + }, + "input_shape": { + "type": "int", + "value": [ + "8", + "32", + "1", + "4096", + "128" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "4", + "1", + "0", + "0", + "1", + "0", + "84", + "42", + "2", + "0", + "85", + "43", + "6", + "0", + "86", + "42" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "4", + "0", + "0", + "256", + "6", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "layers.21.attn.o_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.21/attn/GroupQueryAttention/output_0.out2_21" + ], + "const_args": [ + "model.layers.21.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.21.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.21.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.21.attn.o_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.21/attn/o_proj/MatMulNBits/output_0.out5_4_64" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_42", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.21/input_layernorm/output_3.out4_41", + "/model/layers.21/attn/o_proj/MatMulNBits/output_0.out5_4_64" + ], + "const_args": [ + "model.layers.21.post_attention_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.21/post_attention_layernorm/output_3.out4_42", + "/model/layers.21/post_attention_layernorm/output_0.out4_42" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "FlatMLP_3_21", + "type": "FlatMLP", + "in_args": [ + "/model/layers.21/post_attention_layernorm/output_0.out4_42" + ], + "const_args": [ + "model.layers.21.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.21.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.21.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.21.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.21.mlp.up_proj.MatMulNBits.qweight", + "model.layers.21.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.21.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.21.mlp.up_proj.MatMulNBits.bias.f" + ], + "out_args": [ + "/model/layers.21/mlp/Mul/output_0.out3_21" + ], + "attrs": { + "input_shape": { + "type": "int", + "value": [ + "1", + "4096", + "14336" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16" + ] + } + } + }, + { + "name": "layers.21.mlp.down_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.21/mlp/Mul/output_0.out3_21" + ], + "const_args": [ + "model.layers.21.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.21.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.21.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.21.mlp.down_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.21/mlp/down_proj/MatMulNBits/output_0.out5_4_65" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "14336" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_43", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.21/post_attention_layernorm/output_3.out4_42", + "/model/layers.21/mlp/down_proj/MatMulNBits/output_0.out5_4_65" + ], + "const_args": [ + "model.layers.22.input_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.22/input_layernorm/output_3.out4_43", + "/model/layers.22/input_layernorm/output_0.out4_43" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "MatMulNBits_2_22", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.22/input_layernorm/output_0.out4_43" + ], + "const_args": [ + "model.layers.22.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.22.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.22.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.22.attn.qk_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.22/attn/qk_proj/MatMulNBits/output_0.out5_4_66" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "5120" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "layers.22.attn.v_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.22/input_layernorm/output_0.out4_43" + ], + "const_args": [ + "model.layers.22.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.22.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.22.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.22.attn.v_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "present.22.value" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "1024" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "total_seq_len": { + "type": "int", + "value": [ + "4096" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "5", + "0", + "91", + "45" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "5", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "/model/layers.22/attn/GroupQueryAttention", + "type": "FLATMHA", + "in_args": [ + "/model/layers.22/attn/qk_proj/MatMulNBits/output_0.out5_4_66", + "past_key_values.22.key", + "past_key_values.22.value", + "attention_mask_const_uint", + "sin_cos_cache_token" + ], + "const_args": [], + "out_args": [ + "/model/layers.22/attn/GroupQueryAttention/output_0.out2_22", + "present.22.key" + ], + "attrs": { + "num_heads": { + "type": "int", + "value": [ + "32" + ] + }, + "kv_num_heads": { + "type": "int", + "value": [ + "8" + ] + }, + "scale": { + "type": "float", + "value": [ + "0.0883883461356163" + ] + }, + "softcap": { + "type": "float", + "value": [ + "0.0" + ] + }, + "do_rotary": { + "type": "int", + "value": [ + "0" + ] + }, + "rotary_interleaved": { + "type": "int", + "value": [ + "0" + ] + }, + "input_shape": { + "type": "int", + "value": [ + "8", + "32", + "1", + "4096", + "128" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "4", + "1", + "0", + "0", + "1", + "0", + "88", + "44", + "2", + "0", + "89", + "45", + "6", + "0", + "90", + "44" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "4", + "0", + "0", + "256", + "6", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "layers.22.attn.o_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.22/attn/GroupQueryAttention/output_0.out2_22" + ], + "const_args": [ + "model.layers.22.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.22.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.22.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.22.attn.o_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.22/attn/o_proj/MatMulNBits/output_0.out5_4_67" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_44", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.22/input_layernorm/output_3.out4_43", + "/model/layers.22/attn/o_proj/MatMulNBits/output_0.out5_4_67" + ], + "const_args": [ + "model.layers.22.post_attention_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.22/post_attention_layernorm/output_3.out4_44", + "/model/layers.22/post_attention_layernorm/output_0.out4_44" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "FlatMLP_3_22", + "type": "FlatMLP", + "in_args": [ + "/model/layers.22/post_attention_layernorm/output_0.out4_44" + ], + "const_args": [ + "model.layers.22.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.22.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.22.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.22.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.22.mlp.up_proj.MatMulNBits.qweight", + "model.layers.22.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.22.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.22.mlp.up_proj.MatMulNBits.bias.f" + ], + "out_args": [ + "/model/layers.22/mlp/Mul/output_0.out3_22" + ], + "attrs": { + "input_shape": { + "type": "int", + "value": [ + "1", + "4096", + "14336" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16" + ] + } + } + }, + { + "name": "layers.22.mlp.down_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.22/mlp/Mul/output_0.out3_22" + ], + "const_args": [ + "model.layers.22.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.22.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.22.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.22.mlp.down_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.22/mlp/down_proj/MatMulNBits/output_0.out5_4_68" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "14336" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_45", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.22/post_attention_layernorm/output_3.out4_44", + "/model/layers.22/mlp/down_proj/MatMulNBits/output_0.out5_4_68" + ], + "const_args": [ + "model.layers.23.input_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.23/input_layernorm/output_3.out4_45", + "/model/layers.23/input_layernorm/output_0.out4_45" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "MatMulNBits_2_23", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.23/input_layernorm/output_0.out4_45" + ], + "const_args": [ + "model.layers.23.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.23.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.23.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.23.attn.qk_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.23/attn/qk_proj/MatMulNBits/output_0.out5_4_69" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "5120" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "layers.23.attn.v_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.23/input_layernorm/output_0.out4_45" + ], + "const_args": [ + "model.layers.23.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.23.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.23.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.23.attn.v_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "present.23.value" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "1024" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "total_seq_len": { + "type": "int", + "value": [ + "4096" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "5", + "0", + "95", + "47" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "5", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "/model/layers.23/attn/GroupQueryAttention", + "type": "FLATMHA", + "in_args": [ + "/model/layers.23/attn/qk_proj/MatMulNBits/output_0.out5_4_69", + "past_key_values.23.key", + "past_key_values.23.value", + "attention_mask_const_uint", + "sin_cos_cache_token" + ], + "const_args": [], + "out_args": [ + "/model/layers.23/attn/GroupQueryAttention/output_0.out2_23", + "present.23.key" + ], + "attrs": { + "num_heads": { + "type": "int", + "value": [ + "32" + ] + }, + "kv_num_heads": { + "type": "int", + "value": [ + "8" + ] + }, + "scale": { + "type": "float", + "value": [ + "0.0883883461356163" + ] + }, + "softcap": { + "type": "float", + "value": [ + "0.0" + ] + }, + "do_rotary": { + "type": "int", + "value": [ + "0" + ] + }, + "rotary_interleaved": { + "type": "int", + "value": [ + "0" + ] + }, + "input_shape": { + "type": "int", + "value": [ + "8", + "32", + "1", + "4096", + "128" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "4", + "1", + "0", + "0", + "1", + "0", + "92", + "46", + "2", + "0", + "93", + "47", + "6", + "0", + "94", + "46" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "4", + "0", + "0", + "256", + "6", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "layers.23.attn.o_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.23/attn/GroupQueryAttention/output_0.out2_23" + ], + "const_args": [ + "model.layers.23.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.23.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.23.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.23.attn.o_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.23/attn/o_proj/MatMulNBits/output_0.out5_4_70" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_46", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.23/input_layernorm/output_3.out4_45", + "/model/layers.23/attn/o_proj/MatMulNBits/output_0.out5_4_70" + ], + "const_args": [ + "model.layers.23.post_attention_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.23/post_attention_layernorm/output_3.out4_46", + "/model/layers.23/post_attention_layernorm/output_0.out4_46" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "FlatMLP_3_23", + "type": "FlatMLP", + "in_args": [ + "/model/layers.23/post_attention_layernorm/output_0.out4_46" + ], + "const_args": [ + "model.layers.23.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.23.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.23.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.23.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.23.mlp.up_proj.MatMulNBits.qweight", + "model.layers.23.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.23.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.23.mlp.up_proj.MatMulNBits.bias.f" + ], + "out_args": [ + "/model/layers.23/mlp/Mul/output_0.out3_23" + ], + "attrs": { + "input_shape": { + "type": "int", + "value": [ + "1", + "4096", + "14336" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16" + ] + } + } + }, + { + "name": "layers.23.mlp.down_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.23/mlp/Mul/output_0.out3_23" + ], + "const_args": [ + "model.layers.23.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.23.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.23.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.23.mlp.down_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.23/mlp/down_proj/MatMulNBits/output_0.out5_4_71" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "14336" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_47", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.23/post_attention_layernorm/output_3.out4_46", + "/model/layers.23/mlp/down_proj/MatMulNBits/output_0.out5_4_71" + ], + "const_args": [ + "model.layers.24.input_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.24/input_layernorm/output_3.out4_47", + "/model/layers.24/input_layernorm/output_0.out4_47" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "MatMulNBits_2_24", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.24/input_layernorm/output_0.out4_47" + ], + "const_args": [ + "model.layers.24.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.24.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.24.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.24.attn.qk_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.24/attn/qk_proj/MatMulNBits/output_0.out5_4_72" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "5120" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "layers.24.attn.v_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.24/input_layernorm/output_0.out4_47" + ], + "const_args": [ + "model.layers.24.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.24.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.24.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.24.attn.v_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "present.24.value" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "1024" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "total_seq_len": { + "type": "int", + "value": [ + "4096" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "5", + "0", + "99", + "49" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "5", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "/model/layers.24/attn/GroupQueryAttention", + "type": "FLATMHA", + "in_args": [ + "/model/layers.24/attn/qk_proj/MatMulNBits/output_0.out5_4_72", + "past_key_values.24.key", + "past_key_values.24.value", + "attention_mask_const_uint", + "sin_cos_cache_token" + ], + "const_args": [], + "out_args": [ + "/model/layers.24/attn/GroupQueryAttention/output_0.out2_24", + "present.24.key" + ], + "attrs": { + "num_heads": { + "type": "int", + "value": [ + "32" + ] + }, + "kv_num_heads": { + "type": "int", + "value": [ + "8" + ] + }, + "scale": { + "type": "float", + "value": [ + "0.0883883461356163" + ] + }, + "softcap": { + "type": "float", + "value": [ + "0.0" + ] + }, + "do_rotary": { + "type": "int", + "value": [ + "0" + ] + }, + "rotary_interleaved": { + "type": "int", + "value": [ + "0" + ] + }, + "input_shape": { + "type": "int", + "value": [ + "8", + "32", + "1", + "4096", + "128" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "4", + "1", + "0", + "0", + "1", + "0", + "96", + "48", + "2", + "0", + "97", + "49", + "6", + "0", + "98", + "48" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "4", + "0", + "0", + "256", + "6", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "layers.24.attn.o_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.24/attn/GroupQueryAttention/output_0.out2_24" + ], + "const_args": [ + "model.layers.24.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.24.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.24.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.24.attn.o_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.24/attn/o_proj/MatMulNBits/output_0.out5_4_73" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_48", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.24/input_layernorm/output_3.out4_47", + "/model/layers.24/attn/o_proj/MatMulNBits/output_0.out5_4_73" + ], + "const_args": [ + "model.layers.24.post_attention_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.24/post_attention_layernorm/output_3.out4_48", + "/model/layers.24/post_attention_layernorm/output_0.out4_48" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "FlatMLP_3_24", + "type": "FlatMLP", + "in_args": [ + "/model/layers.24/post_attention_layernorm/output_0.out4_48" + ], + "const_args": [ + "model.layers.24.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.24.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.24.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.24.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.24.mlp.up_proj.MatMulNBits.qweight", + "model.layers.24.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.24.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.24.mlp.up_proj.MatMulNBits.bias.f" + ], + "out_args": [ + "/model/layers.24/mlp/Mul/output_0.out3_24" + ], + "attrs": { + "input_shape": { + "type": "int", + "value": [ + "1", + "4096", + "14336" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16" + ] + } + } + }, + { + "name": "layers.24.mlp.down_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.24/mlp/Mul/output_0.out3_24" + ], + "const_args": [ + "model.layers.24.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.24.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.24.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.24.mlp.down_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.24/mlp/down_proj/MatMulNBits/output_0.out5_4_74" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "14336" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_49", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.24/post_attention_layernorm/output_3.out4_48", + "/model/layers.24/mlp/down_proj/MatMulNBits/output_0.out5_4_74" + ], + "const_args": [ + "model.layers.25.input_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.25/input_layernorm/output_3.out4_49", + "/model/layers.25/input_layernorm/output_0.out4_49" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "MatMulNBits_2_25", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.25/input_layernorm/output_0.out4_49" + ], + "const_args": [ + "model.layers.25.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.25.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.25.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.25.attn.qk_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.25/attn/qk_proj/MatMulNBits/output_0.out5_4_75" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "5120" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "layers.25.attn.v_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.25/input_layernorm/output_0.out4_49" + ], + "const_args": [ + "model.layers.25.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.25.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.25.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.25.attn.v_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "present.25.value" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "1024" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "total_seq_len": { + "type": "int", + "value": [ + "4096" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "5", + "0", + "103", + "51" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "5", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "/model/layers.25/attn/GroupQueryAttention", + "type": "FLATMHA", + "in_args": [ + "/model/layers.25/attn/qk_proj/MatMulNBits/output_0.out5_4_75", + "past_key_values.25.key", + "past_key_values.25.value", + "attention_mask_const_uint", + "sin_cos_cache_token" + ], + "const_args": [], + "out_args": [ + "/model/layers.25/attn/GroupQueryAttention/output_0.out2_25", + "present.25.key" + ], + "attrs": { + "num_heads": { + "type": "int", + "value": [ + "32" + ] + }, + "kv_num_heads": { + "type": "int", + "value": [ + "8" + ] + }, + "scale": { + "type": "float", + "value": [ + "0.0883883461356163" + ] + }, + "softcap": { + "type": "float", + "value": [ + "0.0" + ] + }, + "do_rotary": { + "type": "int", + "value": [ + "0" + ] + }, + "rotary_interleaved": { + "type": "int", + "value": [ + "0" + ] + }, + "input_shape": { + "type": "int", + "value": [ + "8", + "32", + "1", + "4096", + "128" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "4", + "1", + "0", + "0", + "1", + "0", + "100", + "50", + "2", + "0", + "101", + "51", + "6", + "0", + "102", + "50" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "4", + "0", + "0", + "256", + "6", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "layers.25.attn.o_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.25/attn/GroupQueryAttention/output_0.out2_25" + ], + "const_args": [ + "model.layers.25.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.25.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.25.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.25.attn.o_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.25/attn/o_proj/MatMulNBits/output_0.out5_4_76" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_50", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.25/input_layernorm/output_3.out4_49", + "/model/layers.25/attn/o_proj/MatMulNBits/output_0.out5_4_76" + ], + "const_args": [ + "model.layers.25.post_attention_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.25/post_attention_layernorm/output_3.out4_50", + "/model/layers.25/post_attention_layernorm/output_0.out4_50" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "FlatMLP_3_25", + "type": "FlatMLP", + "in_args": [ + "/model/layers.25/post_attention_layernorm/output_0.out4_50" + ], + "const_args": [ + "model.layers.25.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.25.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.25.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.25.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.25.mlp.up_proj.MatMulNBits.qweight", + "model.layers.25.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.25.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.25.mlp.up_proj.MatMulNBits.bias.f" + ], + "out_args": [ + "/model/layers.25/mlp/Mul/output_0.out3_25" + ], + "attrs": { + "input_shape": { + "type": "int", + "value": [ + "1", + "4096", + "14336" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16" + ] + } + } + }, + { + "name": "layers.25.mlp.down_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.25/mlp/Mul/output_0.out3_25" + ], + "const_args": [ + "model.layers.25.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.25.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.25.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.25.mlp.down_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.25/mlp/down_proj/MatMulNBits/output_0.out5_4_77" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "14336" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_51", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.25/post_attention_layernorm/output_3.out4_50", + "/model/layers.25/mlp/down_proj/MatMulNBits/output_0.out5_4_77" + ], + "const_args": [ + "model.layers.26.input_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.26/input_layernorm/output_3.out4_51", + "/model/layers.26/input_layernorm/output_0.out4_51" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "MatMulNBits_2_26", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.26/input_layernorm/output_0.out4_51" + ], + "const_args": [ + "model.layers.26.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.26.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.26.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.26.attn.qk_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.26/attn/qk_proj/MatMulNBits/output_0.out5_4_78" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "5120" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "layers.26.attn.v_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.26/input_layernorm/output_0.out4_51" + ], + "const_args": [ + "model.layers.26.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.26.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.26.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.26.attn.v_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "present.26.value" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "1024" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "total_seq_len": { + "type": "int", + "value": [ + "4096" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "5", + "0", + "107", + "53" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "5", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "/model/layers.26/attn/GroupQueryAttention", + "type": "FLATMHA", + "in_args": [ + "/model/layers.26/attn/qk_proj/MatMulNBits/output_0.out5_4_78", + "past_key_values.26.key", + "past_key_values.26.value", + "attention_mask_const_uint", + "sin_cos_cache_token" + ], + "const_args": [], + "out_args": [ + "/model/layers.26/attn/GroupQueryAttention/output_0.out2_26", + "present.26.key" + ], + "attrs": { + "num_heads": { + "type": "int", + "value": [ + "32" + ] + }, + "kv_num_heads": { + "type": "int", + "value": [ + "8" + ] + }, + "scale": { + "type": "float", + "value": [ + "0.0883883461356163" + ] + }, + "softcap": { + "type": "float", + "value": [ + "0.0" + ] + }, + "do_rotary": { + "type": "int", + "value": [ + "0" + ] + }, + "rotary_interleaved": { + "type": "int", + "value": [ + "0" + ] + }, + "input_shape": { + "type": "int", + "value": [ + "8", + "32", + "1", + "4096", + "128" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "4", + "1", + "0", + "0", + "1", + "0", + "104", + "52", + "2", + "0", + "105", + "53", + "6", + "0", + "106", + "52" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "4", + "0", + "0", + "256", + "6", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "layers.26.attn.o_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.26/attn/GroupQueryAttention/output_0.out2_26" + ], + "const_args": [ + "model.layers.26.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.26.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.26.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.26.attn.o_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.26/attn/o_proj/MatMulNBits/output_0.out5_4_79" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_52", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.26/input_layernorm/output_3.out4_51", + "/model/layers.26/attn/o_proj/MatMulNBits/output_0.out5_4_79" + ], + "const_args": [ + "model.layers.26.post_attention_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.26/post_attention_layernorm/output_3.out4_52", + "/model/layers.26/post_attention_layernorm/output_0.out4_52" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "FlatMLP_3_26", + "type": "FlatMLP", + "in_args": [ + "/model/layers.26/post_attention_layernorm/output_0.out4_52" + ], + "const_args": [ + "model.layers.26.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.26.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.26.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.26.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.26.mlp.up_proj.MatMulNBits.qweight", + "model.layers.26.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.26.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.26.mlp.up_proj.MatMulNBits.bias.f" + ], + "out_args": [ + "/model/layers.26/mlp/Mul/output_0.out3_26" + ], + "attrs": { + "input_shape": { + "type": "int", + "value": [ + "1", + "4096", + "14336" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16" + ] + } + } + }, + { + "name": "layers.26.mlp.down_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.26/mlp/Mul/output_0.out3_26" + ], + "const_args": [ + "model.layers.26.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.26.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.26.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.26.mlp.down_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.26/mlp/down_proj/MatMulNBits/output_0.out5_4_80" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "14336" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_53", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.26/post_attention_layernorm/output_3.out4_52", + "/model/layers.26/mlp/down_proj/MatMulNBits/output_0.out5_4_80" + ], + "const_args": [ + "model.layers.27.input_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.27/input_layernorm/output_3.out4_53", + "/model/layers.27/input_layernorm/output_0.out4_53" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "MatMulNBits_2_27", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.27/input_layernorm/output_0.out4_53" + ], + "const_args": [ + "model.layers.27.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.27.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.27.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.27.attn.qk_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.27/attn/qk_proj/MatMulNBits/output_0.out5_4_81" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "5120" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "layers.27.attn.v_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.27/input_layernorm/output_0.out4_53" + ], + "const_args": [ + "model.layers.27.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.27.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.27.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.27.attn.v_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "present.27.value" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "1024" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "total_seq_len": { + "type": "int", + "value": [ + "4096" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "5", + "0", + "111", + "55" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "5", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "/model/layers.27/attn/GroupQueryAttention", + "type": "FLATMHA", + "in_args": [ + "/model/layers.27/attn/qk_proj/MatMulNBits/output_0.out5_4_81", + "past_key_values.27.key", + "past_key_values.27.value", + "attention_mask_const_uint", + "sin_cos_cache_token" + ], + "const_args": [], + "out_args": [ + "/model/layers.27/attn/GroupQueryAttention/output_0.out2_27", + "present.27.key" + ], + "attrs": { + "num_heads": { + "type": "int", + "value": [ + "32" + ] + }, + "kv_num_heads": { + "type": "int", + "value": [ + "8" + ] + }, + "scale": { + "type": "float", + "value": [ + "0.0883883461356163" + ] + }, + "softcap": { + "type": "float", + "value": [ + "0.0" + ] + }, + "do_rotary": { + "type": "int", + "value": [ + "0" + ] + }, + "rotary_interleaved": { + "type": "int", + "value": [ + "0" + ] + }, + "input_shape": { + "type": "int", + "value": [ + "8", + "32", + "1", + "4096", + "128" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "4", + "1", + "0", + "0", + "1", + "0", + "108", + "54", + "2", + "0", + "109", + "55", + "6", + "0", + "110", + "54" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "4", + "0", + "0", + "256", + "6", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "layers.27.attn.o_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.27/attn/GroupQueryAttention/output_0.out2_27" + ], + "const_args": [ + "model.layers.27.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.27.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.27.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.27.attn.o_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.27/attn/o_proj/MatMulNBits/output_0.out5_4_82" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_54", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.27/input_layernorm/output_3.out4_53", + "/model/layers.27/attn/o_proj/MatMulNBits/output_0.out5_4_82" + ], + "const_args": [ + "model.layers.27.post_attention_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.27/post_attention_layernorm/output_3.out4_54", + "/model/layers.27/post_attention_layernorm/output_0.out4_54" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "FlatMLP_3_27", + "type": "FlatMLP", + "in_args": [ + "/model/layers.27/post_attention_layernorm/output_0.out4_54" + ], + "const_args": [ + "model.layers.27.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.27.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.27.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.27.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.27.mlp.up_proj.MatMulNBits.qweight", + "model.layers.27.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.27.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.27.mlp.up_proj.MatMulNBits.bias.f" + ], + "out_args": [ + "/model/layers.27/mlp/Mul/output_0.out3_27" + ], + "attrs": { + "input_shape": { + "type": "int", + "value": [ + "1", + "4096", + "14336" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16" + ] + } + } + }, + { + "name": "layers.27.mlp.down_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.27/mlp/Mul/output_0.out3_27" + ], + "const_args": [ + "model.layers.27.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.27.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.27.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.27.mlp.down_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.27/mlp/down_proj/MatMulNBits/output_0.out5_4_83" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "14336" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_55", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.27/post_attention_layernorm/output_3.out4_54", + "/model/layers.27/mlp/down_proj/MatMulNBits/output_0.out5_4_83" + ], + "const_args": [ + "model.layers.28.input_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.28/input_layernorm/output_3.out4_55", + "/model/layers.28/input_layernorm/output_0.out4_55" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "MatMulNBits_2_28", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.28/input_layernorm/output_0.out4_55" + ], + "const_args": [ + "model.layers.28.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.28.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.28.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.28.attn.qk_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.28/attn/qk_proj/MatMulNBits/output_0.out5_4_84" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "5120" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "layers.28.attn.v_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.28/input_layernorm/output_0.out4_55" + ], + "const_args": [ + "model.layers.28.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.28.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.28.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.28.attn.v_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "present.28.value" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "1024" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "total_seq_len": { + "type": "int", + "value": [ + "4096" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "5", + "0", + "115", + "57" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "5", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "/model/layers.28/attn/GroupQueryAttention", + "type": "FLATMHA", + "in_args": [ + "/model/layers.28/attn/qk_proj/MatMulNBits/output_0.out5_4_84", + "past_key_values.28.key", + "past_key_values.28.value", + "attention_mask_const_uint", + "sin_cos_cache_token" + ], + "const_args": [], + "out_args": [ + "/model/layers.28/attn/GroupQueryAttention/output_0.out2_28", + "present.28.key" + ], + "attrs": { + "num_heads": { + "type": "int", + "value": [ + "32" + ] + }, + "kv_num_heads": { + "type": "int", + "value": [ + "8" + ] + }, + "scale": { + "type": "float", + "value": [ + "0.0883883461356163" + ] + }, + "softcap": { + "type": "float", + "value": [ + "0.0" + ] + }, + "do_rotary": { + "type": "int", + "value": [ + "0" + ] + }, + "rotary_interleaved": { + "type": "int", + "value": [ + "0" + ] + }, + "input_shape": { + "type": "int", + "value": [ + "8", + "32", + "1", + "4096", + "128" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "4", + "1", + "0", + "0", + "1", + "0", + "112", + "56", + "2", + "0", + "113", + "57", + "6", + "0", + "114", + "56" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "4", + "0", + "0", + "256", + "6", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "layers.28.attn.o_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.28/attn/GroupQueryAttention/output_0.out2_28" + ], + "const_args": [ + "model.layers.28.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.28.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.28.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.28.attn.o_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.28/attn/o_proj/MatMulNBits/output_0.out5_4_85" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_56", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.28/input_layernorm/output_3.out4_55", + "/model/layers.28/attn/o_proj/MatMulNBits/output_0.out5_4_85" + ], + "const_args": [ + "model.layers.28.post_attention_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.28/post_attention_layernorm/output_3.out4_56", + "/model/layers.28/post_attention_layernorm/output_0.out4_56" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "FlatMLP_3_28", + "type": "FlatMLP", + "in_args": [ + "/model/layers.28/post_attention_layernorm/output_0.out4_56" + ], + "const_args": [ + "model.layers.28.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.28.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.28.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.28.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.28.mlp.up_proj.MatMulNBits.qweight", + "model.layers.28.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.28.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.28.mlp.up_proj.MatMulNBits.bias.f" + ], + "out_args": [ + "/model/layers.28/mlp/Mul/output_0.out3_28" + ], + "attrs": { + "input_shape": { + "type": "int", + "value": [ + "1", + "4096", + "14336" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16" + ] + } + } + }, + { + "name": "layers.28.mlp.down_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.28/mlp/Mul/output_0.out3_28" + ], + "const_args": [ + "model.layers.28.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.28.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.28.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.28.mlp.down_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.28/mlp/down_proj/MatMulNBits/output_0.out5_4_86" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "14336" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_57", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.28/post_attention_layernorm/output_3.out4_56", + "/model/layers.28/mlp/down_proj/MatMulNBits/output_0.out5_4_86" + ], + "const_args": [ + "model.layers.29.input_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.29/input_layernorm/output_3.out4_57", + "/model/layers.29/input_layernorm/output_0.out4_57" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "MatMulNBits_2_29", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.29/input_layernorm/output_0.out4_57" + ], + "const_args": [ + "model.layers.29.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.29.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.29.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.29.attn.qk_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.29/attn/qk_proj/MatMulNBits/output_0.out5_4_87" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "5120" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "layers.29.attn.v_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.29/input_layernorm/output_0.out4_57" + ], + "const_args": [ + "model.layers.29.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.29.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.29.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.29.attn.v_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "present.29.value" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "1024" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "total_seq_len": { + "type": "int", + "value": [ + "4096" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "5", + "0", + "119", + "59" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "5", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "/model/layers.29/attn/GroupQueryAttention", + "type": "FLATMHA", + "in_args": [ + "/model/layers.29/attn/qk_proj/MatMulNBits/output_0.out5_4_87", + "past_key_values.29.key", + "past_key_values.29.value", + "attention_mask_const_uint", + "sin_cos_cache_token" + ], + "const_args": [], + "out_args": [ + "/model/layers.29/attn/GroupQueryAttention/output_0.out2_29", + "present.29.key" + ], + "attrs": { + "num_heads": { + "type": "int", + "value": [ + "32" + ] + }, + "kv_num_heads": { + "type": "int", + "value": [ + "8" + ] + }, + "scale": { + "type": "float", + "value": [ + "0.0883883461356163" + ] + }, + "softcap": { + "type": "float", + "value": [ + "0.0" + ] + }, + "do_rotary": { + "type": "int", + "value": [ + "0" + ] + }, + "rotary_interleaved": { + "type": "int", + "value": [ + "0" + ] + }, + "input_shape": { + "type": "int", + "value": [ + "8", + "32", + "1", + "4096", + "128" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "4", + "1", + "0", + "0", + "1", + "0", + "116", + "58", + "2", + "0", + "117", + "59", + "6", + "0", + "118", + "58" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "4", + "0", + "0", + "256", + "6", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "layers.29.attn.o_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.29/attn/GroupQueryAttention/output_0.out2_29" + ], + "const_args": [ + "model.layers.29.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.29.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.29.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.29.attn.o_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.29/attn/o_proj/MatMulNBits/output_0.out5_4_88" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_58", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.29/input_layernorm/output_3.out4_57", + "/model/layers.29/attn/o_proj/MatMulNBits/output_0.out5_4_88" + ], + "const_args": [ + "model.layers.29.post_attention_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.29/post_attention_layernorm/output_3.out4_58", + "/model/layers.29/post_attention_layernorm/output_0.out4_58" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "FlatMLP_3_29", + "type": "FlatMLP", + "in_args": [ + "/model/layers.29/post_attention_layernorm/output_0.out4_58" + ], + "const_args": [ + "model.layers.29.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.29.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.29.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.29.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.29.mlp.up_proj.MatMulNBits.qweight", + "model.layers.29.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.29.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.29.mlp.up_proj.MatMulNBits.bias.f" + ], + "out_args": [ + "/model/layers.29/mlp/Mul/output_0.out3_29" + ], + "attrs": { + "input_shape": { + "type": "int", + "value": [ + "1", + "4096", + "14336" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16" + ] + } + } + }, + { + "name": "layers.29.mlp.down_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.29/mlp/Mul/output_0.out3_29" + ], + "const_args": [ + "model.layers.29.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.29.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.29.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.29.mlp.down_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.29/mlp/down_proj/MatMulNBits/output_0.out5_4_89" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "14336" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_59", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.29/post_attention_layernorm/output_3.out4_58", + "/model/layers.29/mlp/down_proj/MatMulNBits/output_0.out5_4_89" + ], + "const_args": [ + "model.layers.30.input_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.30/input_layernorm/output_3.out4_59", + "/model/layers.30/input_layernorm/output_0.out4_59" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "MatMulNBits_2_30", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.30/input_layernorm/output_0.out4_59" + ], + "const_args": [ + "model.layers.30.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.30.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.30.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.30.attn.qk_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.30/attn/qk_proj/MatMulNBits/output_0.out5_4_90" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "5120" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "layers.30.attn.v_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.30/input_layernorm/output_0.out4_59" + ], + "const_args": [ + "model.layers.30.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.30.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.30.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.30.attn.v_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "present.30.value" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "1024" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "total_seq_len": { + "type": "int", + "value": [ + "4096" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "5", + "0", + "123", + "61" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "5", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "/model/layers.30/attn/GroupQueryAttention", + "type": "FLATMHA", + "in_args": [ + "/model/layers.30/attn/qk_proj/MatMulNBits/output_0.out5_4_90", + "past_key_values.30.key", + "past_key_values.30.value", + "attention_mask_const_uint", + "sin_cos_cache_token" + ], + "const_args": [], + "out_args": [ + "/model/layers.30/attn/GroupQueryAttention/output_0.out2_30", + "present.30.key" + ], + "attrs": { + "num_heads": { + "type": "int", + "value": [ + "32" + ] + }, + "kv_num_heads": { + "type": "int", + "value": [ + "8" + ] + }, + "scale": { + "type": "float", + "value": [ + "0.0883883461356163" + ] + }, + "softcap": { + "type": "float", + "value": [ + "0.0" + ] + }, + "do_rotary": { + "type": "int", + "value": [ + "0" + ] + }, + "rotary_interleaved": { + "type": "int", + "value": [ + "0" + ] + }, + "input_shape": { + "type": "int", + "value": [ + "8", + "32", + "1", + "4096", + "128" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "4", + "1", + "0", + "0", + "1", + "0", + "120", + "60", + "2", + "0", + "121", + "61", + "6", + "0", + "122", + "60" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "4", + "0", + "0", + "256", + "6", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "layers.30.attn.o_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.30/attn/GroupQueryAttention/output_0.out2_30" + ], + "const_args": [ + "model.layers.30.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.30.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.30.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.30.attn.o_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.30/attn/o_proj/MatMulNBits/output_0.out5_4_91" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_60", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.30/input_layernorm/output_3.out4_59", + "/model/layers.30/attn/o_proj/MatMulNBits/output_0.out5_4_91" + ], + "const_args": [ + "model.layers.30.post_attention_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.30/post_attention_layernorm/output_3.out4_60", + "/model/layers.30/post_attention_layernorm/output_0.out4_60" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "FlatMLP_3_30", + "type": "FlatMLP", + "in_args": [ + "/model/layers.30/post_attention_layernorm/output_0.out4_60" + ], + "const_args": [ + "model.layers.30.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.30.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.30.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.30.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.30.mlp.up_proj.MatMulNBits.qweight", + "model.layers.30.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.30.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.30.mlp.up_proj.MatMulNBits.bias.f" + ], + "out_args": [ + "/model/layers.30/mlp/Mul/output_0.out3_30" + ], + "attrs": { + "input_shape": { + "type": "int", + "value": [ + "1", + "4096", + "14336" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16" + ] + } + } + }, + { + "name": "layers.30.mlp.down_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.30/mlp/Mul/output_0.out3_30" + ], + "const_args": [ + "model.layers.30.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.30.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.30.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.30.mlp.down_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.30/mlp/down_proj/MatMulNBits/output_0.out5_4_92" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "14336" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_61", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.30/post_attention_layernorm/output_3.out4_60", + "/model/layers.30/mlp/down_proj/MatMulNBits/output_0.out5_4_92" + ], + "const_args": [ + "model.layers.31.input_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.31/input_layernorm/output_3.out4_61", + "/model/layers.31/input_layernorm/output_0.out4_61" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "MatMulNBits_2_31", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.31/input_layernorm/output_0.out4_61" + ], + "const_args": [ + "model.layers.31.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.31.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.31.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.31.attn.qk_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.31/attn/qk_proj/MatMulNBits/output_0.out5_4_93" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "5120" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "layers.31.attn.v_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.31/input_layernorm/output_0.out4_61" + ], + "const_args": [ + "model.layers.31.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.31.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.31.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.31.attn.v_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "present.31.value" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "1024" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "total_seq_len": { + "type": "int", + "value": [ + "4096" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "5", + "0", + "127", + "63" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "5", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "/model/layers.31/attn/GroupQueryAttention", + "type": "FLATMHA", + "in_args": [ + "/model/layers.31/attn/qk_proj/MatMulNBits/output_0.out5_4_93", + "past_key_values.31.key", + "past_key_values.31.value", + "attention_mask_const_uint", + "sin_cos_cache_token" + ], + "const_args": [], + "out_args": [ + "/model/layers.31/attn/GroupQueryAttention/output_0.out2_31", + "present.31.key" + ], + "attrs": { + "num_heads": { + "type": "int", + "value": [ + "32" + ] + }, + "kv_num_heads": { + "type": "int", + "value": [ + "8" + ] + }, + "scale": { + "type": "float", + "value": [ + "0.0883883461356163" + ] + }, + "softcap": { + "type": "float", + "value": [ + "0.0" + ] + }, + "do_rotary": { + "type": "int", + "value": [ + "0" + ] + }, + "rotary_interleaved": { + "type": "int", + "value": [ + "0" + ] + }, + "input_shape": { + "type": "int", + "value": [ + "8", + "32", + "1", + "4096", + "128" + ] + }, + "external_buffers": { + "type": "int", + "value": [ + "4", + "1", + "0", + "0", + "1", + "0", + "124", + "62", + "2", + "0", + "125", + "63", + "6", + "0", + "126", + "62" + ] + }, + "update_tensor_offsets": { + "type": "int", + "value": [ + "4", + "0", + "0", + "256", + "6", + "0", + "0", + "256" + ] + } + } + }, + { + "name": "layers.31.attn.o_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.31/attn/GroupQueryAttention/output_0.out2_31" + ], + "const_args": [ + "model.layers.31.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.31.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.31.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.31.attn.o_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.31/attn/o_proj/MatMulNBits/output_0.out5_4_94" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_62", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.31/input_layernorm/output_3.out4_61", + "/model/layers.31/attn/o_proj/MatMulNBits/output_0.out5_4_94" + ], + "const_args": [ + "model.layers.31.post_attention_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.31/post_attention_layernorm/output_3.out4_62", + "/model/layers.31/post_attention_layernorm/output_0.out4_62" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "FlatMLP_3_31", + "type": "FlatMLP", + "in_args": [ + "/model/layers.31/post_attention_layernorm/output_0.out4_62" + ], + "const_args": [ + "model.layers.31.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.31.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.31.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.31.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.31.mlp.up_proj.MatMulNBits.qweight", + "model.layers.31.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.31.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.31.mlp.up_proj.MatMulNBits.bias.f" + ], + "out_args": [ + "/model/layers.31/mlp/Mul/output_0.out3_31" + ], + "attrs": { + "input_shape": { + "type": "int", + "value": [ + "1", + "4096", + "14336" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float", + "uint8", + "float" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16" + ] + } + } + }, + { + "name": "layers.31.mlp.down_proj", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.31/mlp/Mul/output_0.out3_31" + ], + "const_args": [ + "model.layers.31.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.31.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.31.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.31.mlp.down_proj.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "/model/layers.31/mlp/down_proj/MatMulNBits/output_0.out5_4_95" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "14336" + ] + }, + "N": { + "type": "int", + "value": [ + "4096" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + }, + { + "name": "FlatRMSAdd_4_63", + "type": "FlatRMSAdd", + "in_args": [ + "/model/layers.31/post_attention_layernorm/output_3.out4_62", + "/model/layers.31/mlp/down_proj/MatMulNBits/output_0.out5_4_95" + ], + "const_args": [ + "model.layers.32.final_norm_layernorm.weight.bf" + ], + "out_args": [ + "/model/layers.32/final_norm_layernorm/output_0.dummy", + "/model/layers.32/final_norm_layernorm/output_0.out4_63" + ], + "attrs": { + "a_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "in_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "out_dtypes": { + "type": "str", + "value": [ + "bfloat16", + "bfloat16" + ] + }, + "c_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "b_shape": { + "type": "int", + "value": [ + "1", + "1", + "4096" + ] + }, + "is_gamma_ifm": { + "type": "int", + "value": [ + "1" + ] + } + } + }, + { + "name": "/lm_head/MatMulNBits", + "type": "MladfMatMul", + "in_args": [ + "/model/layers.32/final_norm_layernorm/output_0.out4_63" + ], + "const_args": [ + "lm_head.MatMulNBits.qweight.preformat", + "lm_head.MatMulNBits.bias.preformat", + "lm_head.MatMulNBits.scales.preformat", + "lm_head.MatMulNBits.qzeros.preformat" + ], + "out_args": [ + "logits.out5_4_96" + ], + "attrs": { + "accuracy_level": { + "type": "int", + "value": [ + "0" + ] + }, + "bits": { + "type": "int", + "value": [ + "4" + ] + }, + "block_size": { + "type": "int", + "value": [ + "128" + ] + }, + "K": { + "type": "int", + "value": [ + "4096" + ] + }, + "N": { + "type": "int", + "value": [ + "32768" + ] + }, + "default_shape": { + "type": "int", + "value": [ + "1" + ] + }, + "op_version": { + "type": "str", + "value": [ + "flat" + ] + }, + "group_size": { + "type": "int", + "value": [ + "128" + ] + } + } + } + ], + "fused_tensors": { + "in": { + "buffer_size": 24704, + "xrt_arg_id": 0, + "packed_tensors": [ + "/model/layers.0/input_layernorm/output_0.out5_4_0", + "attention_mask_const_uint", + "/model/embed_tokens/Gather/output_0.out4_0" + ] + }, + "out": { + "buffer_size": 73728, + "xrt_arg_id": 1, + "packed_tensors": [ + "/model/layers.32/final_norm_layernorm/output_0.dummy", + "logits.out5_4_96" + ] + }, + "scratch": { + "buffer_size": 3072000, + "xrt_arg_id": 2, + "packed_tensors": [ + "/model/layers.0/attn/qk_proj/MatMulNBits/output_0.out5_4_0", + "/model/layers.0/attn/GroupQueryAttention/output_0.out2_0", + "/model/layers.0/attn/o_proj/MatMulNBits/output_0.out5_4_1", + "/model/layers.0/post_attention_layernorm/output_3.out4_0", + "/model/layers.0/post_attention_layernorm/output_0.out4_0", + "/model/layers.0/mlp/Mul/output_0.out3_0", + "/model/layers.0/mlp/down_proj/MatMulNBits/output_0.out5_4_2", + "/model/layers.1/input_layernorm/output_3.out4_1", + "/model/layers.1/input_layernorm/output_0.out4_1", + "/model/layers.1/attn/qk_proj/MatMulNBits/output_0.out5_4_3", + "/model/layers.1/attn/GroupQueryAttention/output_0.out2_1", + "/model/layers.1/attn/o_proj/MatMulNBits/output_0.out5_4_4", + "/model/layers.1/post_attention_layernorm/output_3.out4_2", + "/model/layers.1/post_attention_layernorm/output_0.out4_2", + "/model/layers.1/mlp/Mul/output_0.out3_1", + "/model/layers.1/mlp/down_proj/MatMulNBits/output_0.out5_4_5", + "/model/layers.2/input_layernorm/output_3.out4_3", + "/model/layers.2/input_layernorm/output_0.out4_3", + "/model/layers.2/attn/qk_proj/MatMulNBits/output_0.out5_4_6", + "/model/layers.2/attn/GroupQueryAttention/output_0.out2_2", + "/model/layers.2/attn/o_proj/MatMulNBits/output_0.out5_4_7", + "/model/layers.2/post_attention_layernorm/output_3.out4_4", + "/model/layers.2/post_attention_layernorm/output_0.out4_4", + "/model/layers.2/mlp/Mul/output_0.out3_2", + "/model/layers.2/mlp/down_proj/MatMulNBits/output_0.out5_4_8", + "/model/layers.3/input_layernorm/output_3.out4_5", + "/model/layers.3/input_layernorm/output_0.out4_5", + "/model/layers.3/attn/qk_proj/MatMulNBits/output_0.out5_4_9", + "/model/layers.3/attn/GroupQueryAttention/output_0.out2_3", + "/model/layers.3/attn/o_proj/MatMulNBits/output_0.out5_4_10", + "/model/layers.3/post_attention_layernorm/output_3.out4_6", + "/model/layers.3/post_attention_layernorm/output_0.out4_6", + "/model/layers.3/mlp/Mul/output_0.out3_3", + "/model/layers.3/mlp/down_proj/MatMulNBits/output_0.out5_4_11", + "/model/layers.4/input_layernorm/output_3.out4_7", + "/model/layers.4/input_layernorm/output_0.out4_7", + "/model/layers.4/attn/qk_proj/MatMulNBits/output_0.out5_4_12", + "/model/layers.4/attn/GroupQueryAttention/output_0.out2_4", + "/model/layers.4/attn/o_proj/MatMulNBits/output_0.out5_4_13", + "/model/layers.4/post_attention_layernorm/output_3.out4_8", + "/model/layers.4/post_attention_layernorm/output_0.out4_8", + "/model/layers.4/mlp/Mul/output_0.out3_4", + "/model/layers.4/mlp/down_proj/MatMulNBits/output_0.out5_4_14", + "/model/layers.5/input_layernorm/output_3.out4_9", + "/model/layers.5/input_layernorm/output_0.out4_9", + "/model/layers.5/attn/qk_proj/MatMulNBits/output_0.out5_4_15", + "/model/layers.5/attn/GroupQueryAttention/output_0.out2_5", + "/model/layers.5/attn/o_proj/MatMulNBits/output_0.out5_4_16", + "/model/layers.5/post_attention_layernorm/output_3.out4_10", + "/model/layers.5/post_attention_layernorm/output_0.out4_10", + "/model/layers.5/mlp/Mul/output_0.out3_5", + "/model/layers.5/mlp/down_proj/MatMulNBits/output_0.out5_4_17", + "/model/layers.6/input_layernorm/output_3.out4_11", + "/model/layers.6/input_layernorm/output_0.out4_11", + "/model/layers.6/attn/qk_proj/MatMulNBits/output_0.out5_4_18", + "/model/layers.6/attn/GroupQueryAttention/output_0.out2_6", + "/model/layers.6/attn/o_proj/MatMulNBits/output_0.out5_4_19", + "/model/layers.6/post_attention_layernorm/output_3.out4_12", + "/model/layers.6/post_attention_layernorm/output_0.out4_12", + "/model/layers.6/mlp/Mul/output_0.out3_6", + "/model/layers.6/mlp/down_proj/MatMulNBits/output_0.out5_4_20", + "/model/layers.7/input_layernorm/output_3.out4_13", + "/model/layers.7/input_layernorm/output_0.out4_13", + "/model/layers.7/attn/qk_proj/MatMulNBits/output_0.out5_4_21", + "/model/layers.7/attn/GroupQueryAttention/output_0.out2_7", + "/model/layers.7/attn/o_proj/MatMulNBits/output_0.out5_4_22", + "/model/layers.7/post_attention_layernorm/output_3.out4_14", + "/model/layers.7/post_attention_layernorm/output_0.out4_14", + "/model/layers.7/mlp/Mul/output_0.out3_7", + "/model/layers.7/mlp/down_proj/MatMulNBits/output_0.out5_4_23", + "/model/layers.8/input_layernorm/output_3.out4_15", + "/model/layers.8/input_layernorm/output_0.out4_15", + "/model/layers.8/attn/qk_proj/MatMulNBits/output_0.out5_4_24", + "/model/layers.8/attn/GroupQueryAttention/output_0.out2_8", + "/model/layers.8/attn/o_proj/MatMulNBits/output_0.out5_4_25", + "/model/layers.8/post_attention_layernorm/output_3.out4_16", + "/model/layers.8/post_attention_layernorm/output_0.out4_16", + "/model/layers.8/mlp/Mul/output_0.out3_8", + "/model/layers.8/mlp/down_proj/MatMulNBits/output_0.out5_4_26", + "/model/layers.9/input_layernorm/output_3.out4_17", + "/model/layers.9/input_layernorm/output_0.out4_17", + "/model/layers.9/attn/qk_proj/MatMulNBits/output_0.out5_4_27", + "/model/layers.9/attn/GroupQueryAttention/output_0.out2_9", + "/model/layers.9/attn/o_proj/MatMulNBits/output_0.out5_4_28", + "/model/layers.9/post_attention_layernorm/output_3.out4_18", + "/model/layers.9/post_attention_layernorm/output_0.out4_18", + "/model/layers.9/mlp/Mul/output_0.out3_9", + "/model/layers.9/mlp/down_proj/MatMulNBits/output_0.out5_4_29", + "/model/layers.10/input_layernorm/output_3.out4_19", + "/model/layers.10/input_layernorm/output_0.out4_19", + "/model/layers.10/attn/qk_proj/MatMulNBits/output_0.out5_4_30", + "/model/layers.10/attn/GroupQueryAttention/output_0.out2_10", + "/model/layers.10/attn/o_proj/MatMulNBits/output_0.out5_4_31", + "/model/layers.10/post_attention_layernorm/output_3.out4_20", + "/model/layers.10/post_attention_layernorm/output_0.out4_20", + "/model/layers.10/mlp/Mul/output_0.out3_10", + "/model/layers.10/mlp/down_proj/MatMulNBits/output_0.out5_4_32", + "/model/layers.11/input_layernorm/output_3.out4_21", + "/model/layers.11/input_layernorm/output_0.out4_21", + "/model/layers.11/attn/qk_proj/MatMulNBits/output_0.out5_4_33", + "/model/layers.11/attn/GroupQueryAttention/output_0.out2_11", + "/model/layers.11/attn/o_proj/MatMulNBits/output_0.out5_4_34", + "/model/layers.11/post_attention_layernorm/output_3.out4_22", + "/model/layers.11/post_attention_layernorm/output_0.out4_22", + "/model/layers.11/mlp/Mul/output_0.out3_11", + "/model/layers.11/mlp/down_proj/MatMulNBits/output_0.out5_4_35", + "/model/layers.12/input_layernorm/output_3.out4_23", + "/model/layers.12/input_layernorm/output_0.out4_23", + "/model/layers.12/attn/qk_proj/MatMulNBits/output_0.out5_4_36", + "/model/layers.12/attn/GroupQueryAttention/output_0.out2_12", + "/model/layers.12/attn/o_proj/MatMulNBits/output_0.out5_4_37", + "/model/layers.12/post_attention_layernorm/output_3.out4_24", + "/model/layers.12/post_attention_layernorm/output_0.out4_24", + "/model/layers.12/mlp/Mul/output_0.out3_12", + "/model/layers.12/mlp/down_proj/MatMulNBits/output_0.out5_4_38", + "/model/layers.13/input_layernorm/output_3.out4_25", + "/model/layers.13/input_layernorm/output_0.out4_25", + "/model/layers.13/attn/qk_proj/MatMulNBits/output_0.out5_4_39", + "/model/layers.13/attn/GroupQueryAttention/output_0.out2_13", + "/model/layers.13/attn/o_proj/MatMulNBits/output_0.out5_4_40", + "/model/layers.13/post_attention_layernorm/output_3.out4_26", + "/model/layers.13/post_attention_layernorm/output_0.out4_26", + "/model/layers.13/mlp/Mul/output_0.out3_13", + "/model/layers.13/mlp/down_proj/MatMulNBits/output_0.out5_4_41", + "/model/layers.14/input_layernorm/output_3.out4_27", + "/model/layers.14/input_layernorm/output_0.out4_27", + "/model/layers.14/attn/qk_proj/MatMulNBits/output_0.out5_4_42", + "/model/layers.14/attn/GroupQueryAttention/output_0.out2_14", + "/model/layers.14/attn/o_proj/MatMulNBits/output_0.out5_4_43", + "/model/layers.14/post_attention_layernorm/output_3.out4_28", + "/model/layers.14/post_attention_layernorm/output_0.out4_28", + "/model/layers.14/mlp/Mul/output_0.out3_14", + "/model/layers.14/mlp/down_proj/MatMulNBits/output_0.out5_4_44", + "/model/layers.15/input_layernorm/output_3.out4_29", + "/model/layers.15/input_layernorm/output_0.out4_29", + "/model/layers.15/attn/qk_proj/MatMulNBits/output_0.out5_4_45", + "/model/layers.15/attn/GroupQueryAttention/output_0.out2_15", + "/model/layers.15/attn/o_proj/MatMulNBits/output_0.out5_4_46", + "/model/layers.15/post_attention_layernorm/output_3.out4_30", + "/model/layers.15/post_attention_layernorm/output_0.out4_30", + "/model/layers.15/mlp/Mul/output_0.out3_15", + "/model/layers.15/mlp/down_proj/MatMulNBits/output_0.out5_4_47", + "/model/layers.16/input_layernorm/output_3.out4_31", + "/model/layers.16/input_layernorm/output_0.out4_31", + "/model/layers.16/attn/qk_proj/MatMulNBits/output_0.out5_4_48", + "/model/layers.16/attn/GroupQueryAttention/output_0.out2_16", + "/model/layers.16/attn/o_proj/MatMulNBits/output_0.out5_4_49", + "/model/layers.16/post_attention_layernorm/output_3.out4_32", + "/model/layers.16/post_attention_layernorm/output_0.out4_32", + "/model/layers.16/mlp/Mul/output_0.out3_16", + "/model/layers.16/mlp/down_proj/MatMulNBits/output_0.out5_4_50", + "/model/layers.17/input_layernorm/output_3.out4_33", + "/model/layers.17/input_layernorm/output_0.out4_33", + "/model/layers.17/attn/qk_proj/MatMulNBits/output_0.out5_4_51", + "/model/layers.17/attn/GroupQueryAttention/output_0.out2_17", + "/model/layers.17/attn/o_proj/MatMulNBits/output_0.out5_4_52", + "/model/layers.17/post_attention_layernorm/output_3.out4_34", + "/model/layers.17/post_attention_layernorm/output_0.out4_34", + "/model/layers.17/mlp/Mul/output_0.out3_17", + "/model/layers.17/mlp/down_proj/MatMulNBits/output_0.out5_4_53", + "/model/layers.18/input_layernorm/output_3.out4_35", + "/model/layers.18/input_layernorm/output_0.out4_35", + "/model/layers.18/attn/qk_proj/MatMulNBits/output_0.out5_4_54", + "/model/layers.18/attn/GroupQueryAttention/output_0.out2_18", + "/model/layers.18/attn/o_proj/MatMulNBits/output_0.out5_4_55", + "/model/layers.18/post_attention_layernorm/output_3.out4_36", + "/model/layers.18/post_attention_layernorm/output_0.out4_36", + "/model/layers.18/mlp/Mul/output_0.out3_18", + "/model/layers.18/mlp/down_proj/MatMulNBits/output_0.out5_4_56", + "/model/layers.19/input_layernorm/output_3.out4_37", + "/model/layers.19/input_layernorm/output_0.out4_37", + "/model/layers.19/attn/qk_proj/MatMulNBits/output_0.out5_4_57", + "/model/layers.19/attn/GroupQueryAttention/output_0.out2_19", + "/model/layers.19/attn/o_proj/MatMulNBits/output_0.out5_4_58", + "/model/layers.19/post_attention_layernorm/output_3.out4_38", + "/model/layers.19/post_attention_layernorm/output_0.out4_38", + "/model/layers.19/mlp/Mul/output_0.out3_19", + "/model/layers.19/mlp/down_proj/MatMulNBits/output_0.out5_4_59", + "/model/layers.20/input_layernorm/output_3.out4_39", + "/model/layers.20/input_layernorm/output_0.out4_39", + "/model/layers.20/attn/qk_proj/MatMulNBits/output_0.out5_4_60", + "/model/layers.20/attn/GroupQueryAttention/output_0.out2_20", + "/model/layers.20/attn/o_proj/MatMulNBits/output_0.out5_4_61", + "/model/layers.20/post_attention_layernorm/output_3.out4_40", + "/model/layers.20/post_attention_layernorm/output_0.out4_40", + "/model/layers.20/mlp/Mul/output_0.out3_20", + "/model/layers.20/mlp/down_proj/MatMulNBits/output_0.out5_4_62", + "/model/layers.21/input_layernorm/output_3.out4_41", + "/model/layers.21/input_layernorm/output_0.out4_41", + "/model/layers.21/attn/qk_proj/MatMulNBits/output_0.out5_4_63", + "/model/layers.21/attn/GroupQueryAttention/output_0.out2_21", + "/model/layers.21/attn/o_proj/MatMulNBits/output_0.out5_4_64", + "/model/layers.21/post_attention_layernorm/output_3.out4_42", + "/model/layers.21/post_attention_layernorm/output_0.out4_42", + "/model/layers.21/mlp/Mul/output_0.out3_21", + "/model/layers.21/mlp/down_proj/MatMulNBits/output_0.out5_4_65", + "/model/layers.22/input_layernorm/output_3.out4_43", + "/model/layers.22/input_layernorm/output_0.out4_43", + "/model/layers.22/attn/qk_proj/MatMulNBits/output_0.out5_4_66", + "/model/layers.22/attn/GroupQueryAttention/output_0.out2_22", + "/model/layers.22/attn/o_proj/MatMulNBits/output_0.out5_4_67", + "/model/layers.22/post_attention_layernorm/output_3.out4_44", + "/model/layers.22/post_attention_layernorm/output_0.out4_44", + "/model/layers.22/mlp/Mul/output_0.out3_22", + "/model/layers.22/mlp/down_proj/MatMulNBits/output_0.out5_4_68", + "/model/layers.23/input_layernorm/output_3.out4_45", + "/model/layers.23/input_layernorm/output_0.out4_45", + "/model/layers.23/attn/qk_proj/MatMulNBits/output_0.out5_4_69", + "/model/layers.23/attn/GroupQueryAttention/output_0.out2_23", + "/model/layers.23/attn/o_proj/MatMulNBits/output_0.out5_4_70", + "/model/layers.23/post_attention_layernorm/output_3.out4_46", + "/model/layers.23/post_attention_layernorm/output_0.out4_46", + "/model/layers.23/mlp/Mul/output_0.out3_23", + "/model/layers.23/mlp/down_proj/MatMulNBits/output_0.out5_4_71", + "/model/layers.24/input_layernorm/output_3.out4_47", + "/model/layers.24/input_layernorm/output_0.out4_47", + "/model/layers.24/attn/qk_proj/MatMulNBits/output_0.out5_4_72", + "/model/layers.24/attn/GroupQueryAttention/output_0.out2_24", + "/model/layers.24/attn/o_proj/MatMulNBits/output_0.out5_4_73", + "/model/layers.24/post_attention_layernorm/output_3.out4_48", + "/model/layers.24/post_attention_layernorm/output_0.out4_48", + "/model/layers.24/mlp/Mul/output_0.out3_24", + "/model/layers.24/mlp/down_proj/MatMulNBits/output_0.out5_4_74", + "/model/layers.25/input_layernorm/output_3.out4_49", + "/model/layers.25/input_layernorm/output_0.out4_49", + "/model/layers.25/attn/qk_proj/MatMulNBits/output_0.out5_4_75", + "/model/layers.25/attn/GroupQueryAttention/output_0.out2_25", + "/model/layers.25/attn/o_proj/MatMulNBits/output_0.out5_4_76", + "/model/layers.25/post_attention_layernorm/output_3.out4_50", + "/model/layers.25/post_attention_layernorm/output_0.out4_50", + "/model/layers.25/mlp/Mul/output_0.out3_25", + "/model/layers.25/mlp/down_proj/MatMulNBits/output_0.out5_4_77", + "/model/layers.26/input_layernorm/output_3.out4_51", + "/model/layers.26/input_layernorm/output_0.out4_51", + "/model/layers.26/attn/qk_proj/MatMulNBits/output_0.out5_4_78", + "/model/layers.26/attn/GroupQueryAttention/output_0.out2_26", + "/model/layers.26/attn/o_proj/MatMulNBits/output_0.out5_4_79", + "/model/layers.26/post_attention_layernorm/output_3.out4_52", + "/model/layers.26/post_attention_layernorm/output_0.out4_52", + "/model/layers.26/mlp/Mul/output_0.out3_26", + "/model/layers.26/mlp/down_proj/MatMulNBits/output_0.out5_4_80", + "/model/layers.27/input_layernorm/output_3.out4_53", + "/model/layers.27/input_layernorm/output_0.out4_53", + "/model/layers.27/attn/qk_proj/MatMulNBits/output_0.out5_4_81", + "/model/layers.27/attn/GroupQueryAttention/output_0.out2_27", + "/model/layers.27/attn/o_proj/MatMulNBits/output_0.out5_4_82", + "/model/layers.27/post_attention_layernorm/output_3.out4_54", + "/model/layers.27/post_attention_layernorm/output_0.out4_54", + "/model/layers.27/mlp/Mul/output_0.out3_27", + "/model/layers.27/mlp/down_proj/MatMulNBits/output_0.out5_4_83", + "/model/layers.28/input_layernorm/output_3.out4_55", + "/model/layers.28/input_layernorm/output_0.out4_55", + "/model/layers.28/attn/qk_proj/MatMulNBits/output_0.out5_4_84", + "/model/layers.28/attn/GroupQueryAttention/output_0.out2_28", + "/model/layers.28/attn/o_proj/MatMulNBits/output_0.out5_4_85", + "/model/layers.28/post_attention_layernorm/output_3.out4_56", + "/model/layers.28/post_attention_layernorm/output_0.out4_56", + "/model/layers.28/mlp/Mul/output_0.out3_28", + "/model/layers.28/mlp/down_proj/MatMulNBits/output_0.out5_4_86", + "/model/layers.29/input_layernorm/output_3.out4_57", + "/model/layers.29/input_layernorm/output_0.out4_57", + "/model/layers.29/attn/qk_proj/MatMulNBits/output_0.out5_4_87", + "/model/layers.29/attn/GroupQueryAttention/output_0.out2_29", + "/model/layers.29/attn/o_proj/MatMulNBits/output_0.out5_4_88", + "/model/layers.29/post_attention_layernorm/output_3.out4_58", + "/model/layers.29/post_attention_layernorm/output_0.out4_58", + "/model/layers.29/mlp/Mul/output_0.out3_29", + "/model/layers.29/mlp/down_proj/MatMulNBits/output_0.out5_4_89", + "/model/layers.30/input_layernorm/output_3.out4_59", + "/model/layers.30/input_layernorm/output_0.out4_59", + "/model/layers.30/attn/qk_proj/MatMulNBits/output_0.out5_4_90", + "/model/layers.30/attn/GroupQueryAttention/output_0.out2_30", + "/model/layers.30/attn/o_proj/MatMulNBits/output_0.out5_4_91", + "/model/layers.30/post_attention_layernorm/output_3.out4_60", + "/model/layers.30/post_attention_layernorm/output_0.out4_60", + "/model/layers.30/mlp/Mul/output_0.out3_30", + "/model/layers.30/mlp/down_proj/MatMulNBits/output_0.out5_4_92", + "/model/layers.31/input_layernorm/output_3.out4_61", + "/model/layers.31/input_layernorm/output_0.out4_61", + "/model/layers.31/attn/qk_proj/MatMulNBits/output_0.out5_4_93", + "/model/layers.31/attn/GroupQueryAttention/output_0.out2_31", + "/model/layers.31/attn/o_proj/MatMulNBits/output_0.out5_4_94", + "/model/layers.31/post_attention_layernorm/output_3.out4_62", + "/model/layers.31/post_attention_layernorm/output_0.out4_62", + "/model/layers.31/mlp/Mul/output_0.out3_31", + "/model/layers.31/mlp/down_proj/MatMulNBits/output_0.out5_4_95", + "/model/layers.32/final_norm_layernorm/output_0.out4_63" + ] + }, + "const": { + "buffer_size": 5503844352, + "xrt_arg_id": 3, + "packed_tensors": [ + "model.layers.0.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.0.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.0.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.0.attn.qk_proj.MatMulNBits.qzeros.preformat", + "model.layers.0.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.0.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.0.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.0.attn.v_proj.MatMulNBits.qzeros.preformat", + "model.layers.0.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.0.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.0.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.0.attn.o_proj.MatMulNBits.qzeros.preformat", + "model.layers.0.post_attention_layernorm.weight.bf", + "model.layers.0.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.0.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.0.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.0.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.0.mlp.up_proj.MatMulNBits.qweight", + "model.layers.0.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.0.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.0.mlp.up_proj.MatMulNBits.bias.f", + "model.layers.0.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.0.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.0.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.0.mlp.down_proj.MatMulNBits.qzeros.preformat", + "model.layers.1.input_layernorm.weight.bf", + "model.layers.1.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.1.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.1.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.1.attn.qk_proj.MatMulNBits.qzeros.preformat", + "model.layers.1.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.1.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.1.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.1.attn.v_proj.MatMulNBits.qzeros.preformat", + "model.layers.1.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.1.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.1.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.1.attn.o_proj.MatMulNBits.qzeros.preformat", + "model.layers.1.post_attention_layernorm.weight.bf", + "model.layers.1.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.1.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.1.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.1.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.1.mlp.up_proj.MatMulNBits.qweight", + "model.layers.1.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.1.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.1.mlp.up_proj.MatMulNBits.bias.f", + "model.layers.1.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.1.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.1.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.1.mlp.down_proj.MatMulNBits.qzeros.preformat", + "model.layers.2.input_layernorm.weight.bf", + "model.layers.2.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.2.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.2.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.2.attn.qk_proj.MatMulNBits.qzeros.preformat", + "model.layers.2.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.2.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.2.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.2.attn.v_proj.MatMulNBits.qzeros.preformat", + "model.layers.2.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.2.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.2.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.2.attn.o_proj.MatMulNBits.qzeros.preformat", + "model.layers.2.post_attention_layernorm.weight.bf", + "model.layers.2.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.2.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.2.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.2.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.2.mlp.up_proj.MatMulNBits.qweight", + "model.layers.2.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.2.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.2.mlp.up_proj.MatMulNBits.bias.f", + "model.layers.2.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.2.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.2.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.2.mlp.down_proj.MatMulNBits.qzeros.preformat", + "model.layers.3.input_layernorm.weight.bf", + "model.layers.3.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.3.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.3.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.3.attn.qk_proj.MatMulNBits.qzeros.preformat", + "model.layers.3.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.3.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.3.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.3.attn.v_proj.MatMulNBits.qzeros.preformat", + "model.layers.3.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.3.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.3.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.3.attn.o_proj.MatMulNBits.qzeros.preformat", + "model.layers.3.post_attention_layernorm.weight.bf", + "model.layers.3.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.3.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.3.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.3.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.3.mlp.up_proj.MatMulNBits.qweight", + "model.layers.3.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.3.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.3.mlp.up_proj.MatMulNBits.bias.f", + "model.layers.3.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.3.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.3.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.3.mlp.down_proj.MatMulNBits.qzeros.preformat", + "model.layers.4.input_layernorm.weight.bf", + "model.layers.4.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.4.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.4.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.4.attn.qk_proj.MatMulNBits.qzeros.preformat", + "model.layers.4.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.4.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.4.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.4.attn.v_proj.MatMulNBits.qzeros.preformat", + "model.layers.4.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.4.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.4.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.4.attn.o_proj.MatMulNBits.qzeros.preformat", + "model.layers.4.post_attention_layernorm.weight.bf", + "model.layers.4.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.4.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.4.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.4.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.4.mlp.up_proj.MatMulNBits.qweight", + "model.layers.4.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.4.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.4.mlp.up_proj.MatMulNBits.bias.f", + "model.layers.4.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.4.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.4.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.4.mlp.down_proj.MatMulNBits.qzeros.preformat", + "model.layers.5.input_layernorm.weight.bf", + "model.layers.5.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.5.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.5.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.5.attn.qk_proj.MatMulNBits.qzeros.preformat", + "model.layers.5.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.5.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.5.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.5.attn.v_proj.MatMulNBits.qzeros.preformat", + "model.layers.5.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.5.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.5.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.5.attn.o_proj.MatMulNBits.qzeros.preformat", + "model.layers.5.post_attention_layernorm.weight.bf", + "model.layers.5.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.5.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.5.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.5.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.5.mlp.up_proj.MatMulNBits.qweight", + "model.layers.5.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.5.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.5.mlp.up_proj.MatMulNBits.bias.f", + "model.layers.5.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.5.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.5.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.5.mlp.down_proj.MatMulNBits.qzeros.preformat", + "model.layers.6.input_layernorm.weight.bf", + "model.layers.6.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.6.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.6.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.6.attn.qk_proj.MatMulNBits.qzeros.preformat", + "model.layers.6.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.6.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.6.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.6.attn.v_proj.MatMulNBits.qzeros.preformat", + "model.layers.6.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.6.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.6.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.6.attn.o_proj.MatMulNBits.qzeros.preformat", + "model.layers.6.post_attention_layernorm.weight.bf", + "model.layers.6.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.6.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.6.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.6.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.6.mlp.up_proj.MatMulNBits.qweight", + "model.layers.6.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.6.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.6.mlp.up_proj.MatMulNBits.bias.f", + "model.layers.6.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.6.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.6.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.6.mlp.down_proj.MatMulNBits.qzeros.preformat", + "model.layers.7.input_layernorm.weight.bf", + "model.layers.7.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.7.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.7.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.7.attn.qk_proj.MatMulNBits.qzeros.preformat", + "model.layers.7.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.7.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.7.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.7.attn.v_proj.MatMulNBits.qzeros.preformat", + "model.layers.7.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.7.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.7.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.7.attn.o_proj.MatMulNBits.qzeros.preformat", + "model.layers.7.post_attention_layernorm.weight.bf", + "model.layers.7.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.7.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.7.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.7.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.7.mlp.up_proj.MatMulNBits.qweight", + "model.layers.7.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.7.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.7.mlp.up_proj.MatMulNBits.bias.f", + "model.layers.7.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.7.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.7.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.7.mlp.down_proj.MatMulNBits.qzeros.preformat", + "model.layers.8.input_layernorm.weight.bf", + "model.layers.8.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.8.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.8.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.8.attn.qk_proj.MatMulNBits.qzeros.preformat", + "model.layers.8.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.8.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.8.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.8.attn.v_proj.MatMulNBits.qzeros.preformat", + "model.layers.8.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.8.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.8.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.8.attn.o_proj.MatMulNBits.qzeros.preformat", + "model.layers.8.post_attention_layernorm.weight.bf", + "model.layers.8.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.8.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.8.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.8.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.8.mlp.up_proj.MatMulNBits.qweight", + "model.layers.8.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.8.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.8.mlp.up_proj.MatMulNBits.bias.f", + "model.layers.8.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.8.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.8.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.8.mlp.down_proj.MatMulNBits.qzeros.preformat", + "model.layers.9.input_layernorm.weight.bf", + "model.layers.9.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.9.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.9.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.9.attn.qk_proj.MatMulNBits.qzeros.preformat", + "model.layers.9.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.9.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.9.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.9.attn.v_proj.MatMulNBits.qzeros.preformat", + "model.layers.9.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.9.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.9.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.9.attn.o_proj.MatMulNBits.qzeros.preformat", + "model.layers.9.post_attention_layernorm.weight.bf", + "model.layers.9.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.9.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.9.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.9.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.9.mlp.up_proj.MatMulNBits.qweight", + "model.layers.9.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.9.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.9.mlp.up_proj.MatMulNBits.bias.f", + "model.layers.9.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.9.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.9.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.9.mlp.down_proj.MatMulNBits.qzeros.preformat", + "model.layers.10.input_layernorm.weight.bf", + "model.layers.10.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.10.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.10.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.10.attn.qk_proj.MatMulNBits.qzeros.preformat", + "model.layers.10.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.10.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.10.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.10.attn.v_proj.MatMulNBits.qzeros.preformat", + "model.layers.10.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.10.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.10.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.10.attn.o_proj.MatMulNBits.qzeros.preformat", + "model.layers.10.post_attention_layernorm.weight.bf", + "model.layers.10.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.10.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.10.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.10.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.10.mlp.up_proj.MatMulNBits.qweight", + "model.layers.10.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.10.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.10.mlp.up_proj.MatMulNBits.bias.f", + "model.layers.10.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.10.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.10.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.10.mlp.down_proj.MatMulNBits.qzeros.preformat", + "model.layers.11.input_layernorm.weight.bf", + "model.layers.11.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.11.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.11.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.11.attn.qk_proj.MatMulNBits.qzeros.preformat", + "model.layers.11.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.11.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.11.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.11.attn.v_proj.MatMulNBits.qzeros.preformat", + "model.layers.11.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.11.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.11.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.11.attn.o_proj.MatMulNBits.qzeros.preformat", + "model.layers.11.post_attention_layernorm.weight.bf", + "model.layers.11.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.11.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.11.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.11.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.11.mlp.up_proj.MatMulNBits.qweight", + "model.layers.11.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.11.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.11.mlp.up_proj.MatMulNBits.bias.f", + "model.layers.11.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.11.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.11.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.11.mlp.down_proj.MatMulNBits.qzeros.preformat", + "model.layers.12.input_layernorm.weight.bf", + "model.layers.12.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.12.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.12.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.12.attn.qk_proj.MatMulNBits.qzeros.preformat", + "model.layers.12.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.12.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.12.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.12.attn.v_proj.MatMulNBits.qzeros.preformat", + "model.layers.12.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.12.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.12.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.12.attn.o_proj.MatMulNBits.qzeros.preformat", + "model.layers.12.post_attention_layernorm.weight.bf", + "model.layers.12.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.12.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.12.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.12.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.12.mlp.up_proj.MatMulNBits.qweight", + "model.layers.12.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.12.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.12.mlp.up_proj.MatMulNBits.bias.f", + "model.layers.12.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.12.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.12.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.12.mlp.down_proj.MatMulNBits.qzeros.preformat", + "model.layers.13.input_layernorm.weight.bf", + "model.layers.13.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.13.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.13.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.13.attn.qk_proj.MatMulNBits.qzeros.preformat", + "model.layers.13.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.13.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.13.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.13.attn.v_proj.MatMulNBits.qzeros.preformat", + "model.layers.13.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.13.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.13.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.13.attn.o_proj.MatMulNBits.qzeros.preformat", + "model.layers.13.post_attention_layernorm.weight.bf", + "model.layers.13.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.13.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.13.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.13.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.13.mlp.up_proj.MatMulNBits.qweight", + "model.layers.13.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.13.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.13.mlp.up_proj.MatMulNBits.bias.f", + "model.layers.13.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.13.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.13.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.13.mlp.down_proj.MatMulNBits.qzeros.preformat", + "model.layers.14.input_layernorm.weight.bf", + "model.layers.14.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.14.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.14.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.14.attn.qk_proj.MatMulNBits.qzeros.preformat", + "model.layers.14.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.14.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.14.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.14.attn.v_proj.MatMulNBits.qzeros.preformat", + "model.layers.14.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.14.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.14.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.14.attn.o_proj.MatMulNBits.qzeros.preformat", + "model.layers.14.post_attention_layernorm.weight.bf", + "model.layers.14.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.14.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.14.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.14.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.14.mlp.up_proj.MatMulNBits.qweight", + "model.layers.14.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.14.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.14.mlp.up_proj.MatMulNBits.bias.f", + "model.layers.14.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.14.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.14.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.14.mlp.down_proj.MatMulNBits.qzeros.preformat", + "model.layers.15.input_layernorm.weight.bf", + "model.layers.15.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.15.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.15.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.15.attn.qk_proj.MatMulNBits.qzeros.preformat", + "model.layers.15.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.15.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.15.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.15.attn.v_proj.MatMulNBits.qzeros.preformat", + "model.layers.15.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.15.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.15.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.15.attn.o_proj.MatMulNBits.qzeros.preformat", + "model.layers.15.post_attention_layernorm.weight.bf", + "model.layers.15.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.15.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.15.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.15.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.15.mlp.up_proj.MatMulNBits.qweight", + "model.layers.15.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.15.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.15.mlp.up_proj.MatMulNBits.bias.f", + "model.layers.15.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.15.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.15.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.15.mlp.down_proj.MatMulNBits.qzeros.preformat", + "model.layers.16.input_layernorm.weight.bf", + "model.layers.16.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.16.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.16.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.16.attn.qk_proj.MatMulNBits.qzeros.preformat", + "model.layers.16.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.16.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.16.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.16.attn.v_proj.MatMulNBits.qzeros.preformat", + "model.layers.16.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.16.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.16.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.16.attn.o_proj.MatMulNBits.qzeros.preformat", + "model.layers.16.post_attention_layernorm.weight.bf", + "model.layers.16.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.16.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.16.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.16.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.16.mlp.up_proj.MatMulNBits.qweight", + "model.layers.16.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.16.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.16.mlp.up_proj.MatMulNBits.bias.f", + "model.layers.16.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.16.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.16.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.16.mlp.down_proj.MatMulNBits.qzeros.preformat", + "model.layers.17.input_layernorm.weight.bf", + "model.layers.17.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.17.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.17.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.17.attn.qk_proj.MatMulNBits.qzeros.preformat", + "model.layers.17.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.17.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.17.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.17.attn.v_proj.MatMulNBits.qzeros.preformat", + "model.layers.17.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.17.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.17.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.17.attn.o_proj.MatMulNBits.qzeros.preformat", + "model.layers.17.post_attention_layernorm.weight.bf", + "model.layers.17.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.17.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.17.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.17.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.17.mlp.up_proj.MatMulNBits.qweight", + "model.layers.17.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.17.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.17.mlp.up_proj.MatMulNBits.bias.f", + "model.layers.17.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.17.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.17.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.17.mlp.down_proj.MatMulNBits.qzeros.preformat", + "model.layers.18.input_layernorm.weight.bf", + "model.layers.18.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.18.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.18.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.18.attn.qk_proj.MatMulNBits.qzeros.preformat", + "model.layers.18.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.18.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.18.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.18.attn.v_proj.MatMulNBits.qzeros.preformat", + "model.layers.18.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.18.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.18.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.18.attn.o_proj.MatMulNBits.qzeros.preformat", + "model.layers.18.post_attention_layernorm.weight.bf", + "model.layers.18.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.18.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.18.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.18.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.18.mlp.up_proj.MatMulNBits.qweight", + "model.layers.18.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.18.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.18.mlp.up_proj.MatMulNBits.bias.f", + "model.layers.18.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.18.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.18.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.18.mlp.down_proj.MatMulNBits.qzeros.preformat", + "model.layers.19.input_layernorm.weight.bf", + "model.layers.19.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.19.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.19.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.19.attn.qk_proj.MatMulNBits.qzeros.preformat", + "model.layers.19.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.19.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.19.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.19.attn.v_proj.MatMulNBits.qzeros.preformat", + "model.layers.19.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.19.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.19.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.19.attn.o_proj.MatMulNBits.qzeros.preformat", + "model.layers.19.post_attention_layernorm.weight.bf", + "model.layers.19.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.19.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.19.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.19.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.19.mlp.up_proj.MatMulNBits.qweight", + "model.layers.19.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.19.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.19.mlp.up_proj.MatMulNBits.bias.f", + "model.layers.19.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.19.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.19.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.19.mlp.down_proj.MatMulNBits.qzeros.preformat", + "model.layers.20.input_layernorm.weight.bf", + "model.layers.20.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.20.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.20.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.20.attn.qk_proj.MatMulNBits.qzeros.preformat", + "model.layers.20.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.20.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.20.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.20.attn.v_proj.MatMulNBits.qzeros.preformat", + "model.layers.20.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.20.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.20.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.20.attn.o_proj.MatMulNBits.qzeros.preformat", + "model.layers.20.post_attention_layernorm.weight.bf", + "model.layers.20.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.20.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.20.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.20.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.20.mlp.up_proj.MatMulNBits.qweight", + "model.layers.20.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.20.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.20.mlp.up_proj.MatMulNBits.bias.f", + "model.layers.20.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.20.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.20.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.20.mlp.down_proj.MatMulNBits.qzeros.preformat", + "model.layers.21.input_layernorm.weight.bf", + "model.layers.21.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.21.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.21.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.21.attn.qk_proj.MatMulNBits.qzeros.preformat", + "model.layers.21.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.21.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.21.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.21.attn.v_proj.MatMulNBits.qzeros.preformat", + "model.layers.21.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.21.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.21.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.21.attn.o_proj.MatMulNBits.qzeros.preformat", + "model.layers.21.post_attention_layernorm.weight.bf", + "model.layers.21.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.21.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.21.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.21.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.21.mlp.up_proj.MatMulNBits.qweight", + "model.layers.21.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.21.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.21.mlp.up_proj.MatMulNBits.bias.f", + "model.layers.21.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.21.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.21.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.21.mlp.down_proj.MatMulNBits.qzeros.preformat", + "model.layers.22.input_layernorm.weight.bf", + "model.layers.22.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.22.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.22.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.22.attn.qk_proj.MatMulNBits.qzeros.preformat", + "model.layers.22.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.22.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.22.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.22.attn.v_proj.MatMulNBits.qzeros.preformat", + "model.layers.22.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.22.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.22.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.22.attn.o_proj.MatMulNBits.qzeros.preformat", + "model.layers.22.post_attention_layernorm.weight.bf", + "model.layers.22.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.22.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.22.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.22.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.22.mlp.up_proj.MatMulNBits.qweight", + "model.layers.22.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.22.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.22.mlp.up_proj.MatMulNBits.bias.f", + "model.layers.22.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.22.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.22.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.22.mlp.down_proj.MatMulNBits.qzeros.preformat", + "model.layers.23.input_layernorm.weight.bf", + "model.layers.23.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.23.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.23.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.23.attn.qk_proj.MatMulNBits.qzeros.preformat", + "model.layers.23.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.23.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.23.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.23.attn.v_proj.MatMulNBits.qzeros.preformat", + "model.layers.23.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.23.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.23.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.23.attn.o_proj.MatMulNBits.qzeros.preformat", + "model.layers.23.post_attention_layernorm.weight.bf", + "model.layers.23.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.23.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.23.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.23.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.23.mlp.up_proj.MatMulNBits.qweight", + "model.layers.23.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.23.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.23.mlp.up_proj.MatMulNBits.bias.f", + "model.layers.23.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.23.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.23.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.23.mlp.down_proj.MatMulNBits.qzeros.preformat", + "model.layers.24.input_layernorm.weight.bf", + "model.layers.24.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.24.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.24.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.24.attn.qk_proj.MatMulNBits.qzeros.preformat", + "model.layers.24.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.24.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.24.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.24.attn.v_proj.MatMulNBits.qzeros.preformat", + "model.layers.24.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.24.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.24.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.24.attn.o_proj.MatMulNBits.qzeros.preformat", + "model.layers.24.post_attention_layernorm.weight.bf", + "model.layers.24.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.24.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.24.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.24.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.24.mlp.up_proj.MatMulNBits.qweight", + "model.layers.24.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.24.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.24.mlp.up_proj.MatMulNBits.bias.f", + "model.layers.24.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.24.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.24.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.24.mlp.down_proj.MatMulNBits.qzeros.preformat", + "model.layers.25.input_layernorm.weight.bf", + "model.layers.25.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.25.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.25.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.25.attn.qk_proj.MatMulNBits.qzeros.preformat", + "model.layers.25.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.25.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.25.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.25.attn.v_proj.MatMulNBits.qzeros.preformat", + "model.layers.25.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.25.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.25.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.25.attn.o_proj.MatMulNBits.qzeros.preformat", + "model.layers.25.post_attention_layernorm.weight.bf", + "model.layers.25.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.25.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.25.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.25.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.25.mlp.up_proj.MatMulNBits.qweight", + "model.layers.25.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.25.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.25.mlp.up_proj.MatMulNBits.bias.f", + "model.layers.25.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.25.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.25.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.25.mlp.down_proj.MatMulNBits.qzeros.preformat", + "model.layers.26.input_layernorm.weight.bf", + "model.layers.26.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.26.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.26.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.26.attn.qk_proj.MatMulNBits.qzeros.preformat", + "model.layers.26.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.26.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.26.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.26.attn.v_proj.MatMulNBits.qzeros.preformat", + "model.layers.26.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.26.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.26.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.26.attn.o_proj.MatMulNBits.qzeros.preformat", + "model.layers.26.post_attention_layernorm.weight.bf", + "model.layers.26.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.26.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.26.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.26.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.26.mlp.up_proj.MatMulNBits.qweight", + "model.layers.26.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.26.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.26.mlp.up_proj.MatMulNBits.bias.f", + "model.layers.26.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.26.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.26.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.26.mlp.down_proj.MatMulNBits.qzeros.preformat", + "model.layers.27.input_layernorm.weight.bf", + "model.layers.27.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.27.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.27.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.27.attn.qk_proj.MatMulNBits.qzeros.preformat", + "model.layers.27.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.27.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.27.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.27.attn.v_proj.MatMulNBits.qzeros.preformat", + "model.layers.27.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.27.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.27.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.27.attn.o_proj.MatMulNBits.qzeros.preformat", + "model.layers.27.post_attention_layernorm.weight.bf", + "model.layers.27.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.27.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.27.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.27.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.27.mlp.up_proj.MatMulNBits.qweight", + "model.layers.27.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.27.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.27.mlp.up_proj.MatMulNBits.bias.f", + "model.layers.27.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.27.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.27.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.27.mlp.down_proj.MatMulNBits.qzeros.preformat", + "model.layers.28.input_layernorm.weight.bf", + "model.layers.28.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.28.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.28.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.28.attn.qk_proj.MatMulNBits.qzeros.preformat", + "model.layers.28.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.28.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.28.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.28.attn.v_proj.MatMulNBits.qzeros.preformat", + "model.layers.28.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.28.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.28.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.28.attn.o_proj.MatMulNBits.qzeros.preformat", + "model.layers.28.post_attention_layernorm.weight.bf", + "model.layers.28.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.28.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.28.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.28.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.28.mlp.up_proj.MatMulNBits.qweight", + "model.layers.28.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.28.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.28.mlp.up_proj.MatMulNBits.bias.f", + "model.layers.28.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.28.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.28.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.28.mlp.down_proj.MatMulNBits.qzeros.preformat", + "model.layers.29.input_layernorm.weight.bf", + "model.layers.29.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.29.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.29.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.29.attn.qk_proj.MatMulNBits.qzeros.preformat", + "model.layers.29.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.29.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.29.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.29.attn.v_proj.MatMulNBits.qzeros.preformat", + "model.layers.29.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.29.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.29.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.29.attn.o_proj.MatMulNBits.qzeros.preformat", + "model.layers.29.post_attention_layernorm.weight.bf", + "model.layers.29.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.29.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.29.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.29.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.29.mlp.up_proj.MatMulNBits.qweight", + "model.layers.29.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.29.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.29.mlp.up_proj.MatMulNBits.bias.f", + "model.layers.29.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.29.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.29.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.29.mlp.down_proj.MatMulNBits.qzeros.preformat", + "model.layers.30.input_layernorm.weight.bf", + "model.layers.30.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.30.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.30.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.30.attn.qk_proj.MatMulNBits.qzeros.preformat", + "model.layers.30.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.30.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.30.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.30.attn.v_proj.MatMulNBits.qzeros.preformat", + "model.layers.30.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.30.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.30.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.30.attn.o_proj.MatMulNBits.qzeros.preformat", + "model.layers.30.post_attention_layernorm.weight.bf", + "model.layers.30.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.30.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.30.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.30.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.30.mlp.up_proj.MatMulNBits.qweight", + "model.layers.30.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.30.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.30.mlp.up_proj.MatMulNBits.bias.f", + "model.layers.30.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.30.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.30.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.30.mlp.down_proj.MatMulNBits.qzeros.preformat", + "model.layers.31.input_layernorm.weight.bf", + "model.layers.31.attn.qk_proj.MatMulNBits.qweight.preformat", + "model.layers.31.attn.qk_proj.MatMulNBits.bias.preformat", + "model.layers.31.attn.qk_proj.MatMulNBits.scales.preformat", + "model.layers.31.attn.qk_proj.MatMulNBits.qzeros.preformat", + "model.layers.31.attn.v_proj.MatMulNBits.qweight.preformat", + "model.layers.31.attn.v_proj.MatMulNBits.bias.preformat", + "model.layers.31.attn.v_proj.MatMulNBits.scales.preformat", + "model.layers.31.attn.v_proj.MatMulNBits.qzeros.preformat", + "model.layers.31.attn.o_proj.MatMulNBits.qweight.preformat", + "model.layers.31.attn.o_proj.MatMulNBits.bias.preformat", + "model.layers.31.attn.o_proj.MatMulNBits.scales.preformat", + "model.layers.31.attn.o_proj.MatMulNBits.qzeros.preformat", + "model.layers.31.post_attention_layernorm.weight.bf", + "model.layers.31.mlp.gate_proj.MatMulNBits.qweight", + "model.layers.31.mlp.gate_proj.MatMulNBits.scales.f", + "model.layers.31.mlp.gate_proj.MatMulNBits.qzeros", + "model.layers.31.mlp.gate_proj.MatMulNBits.bias.f", + "model.layers.31.mlp.up_proj.MatMulNBits.qweight", + "model.layers.31.mlp.up_proj.MatMulNBits.scales.f", + "model.layers.31.mlp.up_proj.MatMulNBits.qzeros", + "model.layers.31.mlp.up_proj.MatMulNBits.bias.f", + "model.layers.31.mlp.down_proj.MatMulNBits.qweight.preformat", + "model.layers.31.mlp.down_proj.MatMulNBits.bias.preformat", + "model.layers.31.mlp.down_proj.MatMulNBits.scales.preformat", + "model.layers.31.mlp.down_proj.MatMulNBits.qzeros.preformat", + "model.layers.32.final_norm_layernorm.weight.bf", + "lm_head.MatMulNBits.qweight.preformat", + "lm_head.MatMulNBits.bias.preformat", + "lm_head.MatMulNBits.scales.preformat", + "lm_head.MatMulNBits.qzeros.preformat" + ] + }, + "super_instr": { + "buffer_size": 0, + "xrt_arg_id": 4, + "packed_tensors": [] + }, + "ext_buf_0": { + "buffer_size": 536870912, + "xrt_arg_id": 5, + "packed_tensors": [ + "past_key_values.0.key", + "past_key_values.0.value", + "present.0.key", + "present.0.value", + "past_key_values.1.key", + "past_key_values.1.value", + "present.1.key", + "present.1.value", + "past_key_values.2.key", + "past_key_values.2.value", + "present.2.key", + "present.2.value", + "past_key_values.3.key", + "past_key_values.3.value", + "present.3.key", + "present.3.value", + "past_key_values.4.key", + "past_key_values.4.value", + "present.4.key", + "present.4.value", + "past_key_values.5.key", + "past_key_values.5.value", + "present.5.key", + "present.5.value", + "past_key_values.6.key", + "past_key_values.6.value", + "present.6.key", + "present.6.value", + "past_key_values.7.key", + "past_key_values.7.value", + "present.7.key", + "present.7.value", + "past_key_values.8.key", + "past_key_values.8.value", + "present.8.key", + "present.8.value", + "past_key_values.9.key", + "past_key_values.9.value", + "present.9.key", + "present.9.value", + "past_key_values.10.key", + "past_key_values.10.value", + "present.10.key", + "present.10.value", + "past_key_values.11.key", + "past_key_values.11.value", + "present.11.key", + "present.11.value", + "past_key_values.12.key", + "past_key_values.12.value", + "present.12.key", + "present.12.value", + "past_key_values.13.key", + "past_key_values.13.value", + "present.13.key", + "present.13.value", + "past_key_values.14.key", + "past_key_values.14.value", + "present.14.key", + "present.14.value", + "past_key_values.15.key", + "past_key_values.15.value", + "present.15.key", + "present.15.value", + "past_key_values.16.key", + "past_key_values.16.value", + "present.16.key", + "present.16.value", + "past_key_values.17.key", + "past_key_values.17.value", + "present.17.key", + "present.17.value", + "past_key_values.18.key", + "past_key_values.18.value", + "present.18.key", + "present.18.value", + "past_key_values.19.key", + "past_key_values.19.value", + "present.19.key", + "present.19.value", + "past_key_values.20.key", + "past_key_values.20.value", + "present.20.key", + "present.20.value", + "past_key_values.21.key", + "past_key_values.21.value", + "present.21.key", + "present.21.value", + "past_key_values.22.key", + "past_key_values.22.value", + "present.22.key", + "present.22.value", + "past_key_values.23.key", + "past_key_values.23.value", + "present.23.key", + "present.23.value", + "past_key_values.24.key", + "past_key_values.24.value", + "present.24.key", + "present.24.value", + "past_key_values.25.key", + "past_key_values.25.value", + "present.25.key", + "present.25.value", + "past_key_values.26.key", + "past_key_values.26.value", + "present.26.key", + "present.26.value", + "past_key_values.27.key", + "past_key_values.27.value", + "present.27.key", + "present.27.value", + "past_key_values.28.key", + "past_key_values.28.value", + "present.28.key", + "present.28.value", + "past_key_values.29.key", + "past_key_values.29.value", + "present.29.key", + "present.29.value", + "past_key_values.30.key", + "past_key_values.30.value", + "present.30.key", + "present.30.value", + "past_key_values.31.key", + "past_key_values.31.value", + "present.31.key", + "present.31.value" + ] + }, + "ext_buf_1": { + "buffer_size": 8388608, + "xrt_arg_id": 6, + "packed_tensors": [ + "sin_cos_cache_token" + ] + } + }, + "tensor_map": { + "/model/layers.0/input_layernorm/output_0.out5_4_0": { + "packed_buffer_label": "in", + "xrt_arg_id": 0, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 8192 + }, + "attention_mask_const_uint": { + "packed_buffer_label": "in", + "xrt_arg_id": 0, + "dtype": "uint32", + "shape": [ + 1 + ], + "size_in_bytes": 4, + "op_tensor_size": 4, + "offset": 24700 + }, + "/model/embed_tokens/Gather/output_0.out4_0": { + "packed_buffer_label": "in", + "xrt_arg_id": 0, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 16388 + }, + "/model/layers.32/final_norm_layernorm/output_0.dummy": { + "packed_buffer_label": "out", + "xrt_arg_id": 1, + "dtype": "float16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 0 + }, + "logits.out5_4_96": { + "packed_buffer_label": "out", + "xrt_arg_id": 1, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 32768 + ], + "size_in_bytes": 65536, + "op_tensor_size": 65536, + "offset": 8192 + }, + "/model/layers.0/attn/qk_proj/MatMulNBits/output_0.out5_4_0": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 5120 + ], + "size_in_bytes": 10240, + "op_tensor_size": 10240, + "offset": 0 + }, + "/model/layers.0/attn/GroupQueryAttention/output_0.out2_0": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 10240 + }, + "/model/layers.0/attn/o_proj/MatMulNBits/output_0.out5_4_1": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 18432 + }, + "/model/layers.0/post_attention_layernorm/output_3.out4_0": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 26624 + }, + "/model/layers.0/post_attention_layernorm/output_0.out4_0": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 34816 + }, + "/model/layers.0/mlp/Mul/output_0.out3_0": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 14336 + ], + "size_in_bytes": 28672, + "op_tensor_size": 28672, + "offset": 43008 + }, + "/model/layers.0/mlp/down_proj/MatMulNBits/output_0.out5_4_2": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 71680 + }, + "/model/layers.1/input_layernorm/output_3.out4_1": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 79872 + }, + "/model/layers.1/input_layernorm/output_0.out4_1": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 88064 + }, + "/model/layers.1/attn/qk_proj/MatMulNBits/output_0.out5_4_3": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 5120 + ], + "size_in_bytes": 10240, + "op_tensor_size": 10240, + "offset": 96256 + }, + "/model/layers.1/attn/GroupQueryAttention/output_0.out2_1": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 106496 + }, + "/model/layers.1/attn/o_proj/MatMulNBits/output_0.out5_4_4": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 114688 + }, + "/model/layers.1/post_attention_layernorm/output_3.out4_2": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 122880 + }, + "/model/layers.1/post_attention_layernorm/output_0.out4_2": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 131072 + }, + "/model/layers.1/mlp/Mul/output_0.out3_1": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 14336 + ], + "size_in_bytes": 28672, + "op_tensor_size": 28672, + "offset": 139264 + }, + "/model/layers.1/mlp/down_proj/MatMulNBits/output_0.out5_4_5": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 167936 + }, + "/model/layers.2/input_layernorm/output_3.out4_3": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 176128 + }, + "/model/layers.2/input_layernorm/output_0.out4_3": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 184320 + }, + "/model/layers.2/attn/qk_proj/MatMulNBits/output_0.out5_4_6": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 5120 + ], + "size_in_bytes": 10240, + "op_tensor_size": 10240, + "offset": 192512 + }, + "/model/layers.2/attn/GroupQueryAttention/output_0.out2_2": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 202752 + }, + "/model/layers.2/attn/o_proj/MatMulNBits/output_0.out5_4_7": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 210944 + }, + "/model/layers.2/post_attention_layernorm/output_3.out4_4": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 219136 + }, + "/model/layers.2/post_attention_layernorm/output_0.out4_4": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 227328 + }, + "/model/layers.2/mlp/Mul/output_0.out3_2": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 14336 + ], + "size_in_bytes": 28672, + "op_tensor_size": 28672, + "offset": 235520 + }, + "/model/layers.2/mlp/down_proj/MatMulNBits/output_0.out5_4_8": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 264192 + }, + "/model/layers.3/input_layernorm/output_3.out4_5": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 272384 + }, + "/model/layers.3/input_layernorm/output_0.out4_5": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 280576 + }, + "/model/layers.3/attn/qk_proj/MatMulNBits/output_0.out5_4_9": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 5120 + ], + "size_in_bytes": 10240, + "op_tensor_size": 10240, + "offset": 288768 + }, + "/model/layers.3/attn/GroupQueryAttention/output_0.out2_3": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 299008 + }, + "/model/layers.3/attn/o_proj/MatMulNBits/output_0.out5_4_10": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 307200 + }, + "/model/layers.3/post_attention_layernorm/output_3.out4_6": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 315392 + }, + "/model/layers.3/post_attention_layernorm/output_0.out4_6": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 323584 + }, + "/model/layers.3/mlp/Mul/output_0.out3_3": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 14336 + ], + "size_in_bytes": 28672, + "op_tensor_size": 28672, + "offset": 331776 + }, + "/model/layers.3/mlp/down_proj/MatMulNBits/output_0.out5_4_11": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 360448 + }, + "/model/layers.4/input_layernorm/output_3.out4_7": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 368640 + }, + "/model/layers.4/input_layernorm/output_0.out4_7": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 376832 + }, + "/model/layers.4/attn/qk_proj/MatMulNBits/output_0.out5_4_12": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 5120 + ], + "size_in_bytes": 10240, + "op_tensor_size": 10240, + "offset": 385024 + }, + "/model/layers.4/attn/GroupQueryAttention/output_0.out2_4": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 395264 + }, + "/model/layers.4/attn/o_proj/MatMulNBits/output_0.out5_4_13": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 403456 + }, + "/model/layers.4/post_attention_layernorm/output_3.out4_8": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 411648 + }, + "/model/layers.4/post_attention_layernorm/output_0.out4_8": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 419840 + }, + "/model/layers.4/mlp/Mul/output_0.out3_4": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 14336 + ], + "size_in_bytes": 28672, + "op_tensor_size": 28672, + "offset": 428032 + }, + "/model/layers.4/mlp/down_proj/MatMulNBits/output_0.out5_4_14": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 456704 + }, + "/model/layers.5/input_layernorm/output_3.out4_9": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 464896 + }, + "/model/layers.5/input_layernorm/output_0.out4_9": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 473088 + }, + "/model/layers.5/attn/qk_proj/MatMulNBits/output_0.out5_4_15": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 5120 + ], + "size_in_bytes": 10240, + "op_tensor_size": 10240, + "offset": 481280 + }, + "/model/layers.5/attn/GroupQueryAttention/output_0.out2_5": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 491520 + }, + "/model/layers.5/attn/o_proj/MatMulNBits/output_0.out5_4_16": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 499712 + }, + "/model/layers.5/post_attention_layernorm/output_3.out4_10": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 507904 + }, + "/model/layers.5/post_attention_layernorm/output_0.out4_10": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 516096 + }, + "/model/layers.5/mlp/Mul/output_0.out3_5": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 14336 + ], + "size_in_bytes": 28672, + "op_tensor_size": 28672, + "offset": 524288 + }, + "/model/layers.5/mlp/down_proj/MatMulNBits/output_0.out5_4_17": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 552960 + }, + "/model/layers.6/input_layernorm/output_3.out4_11": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 561152 + }, + "/model/layers.6/input_layernorm/output_0.out4_11": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 569344 + }, + "/model/layers.6/attn/qk_proj/MatMulNBits/output_0.out5_4_18": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 5120 + ], + "size_in_bytes": 10240, + "op_tensor_size": 10240, + "offset": 577536 + }, + "/model/layers.6/attn/GroupQueryAttention/output_0.out2_6": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 587776 + }, + "/model/layers.6/attn/o_proj/MatMulNBits/output_0.out5_4_19": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 595968 + }, + "/model/layers.6/post_attention_layernorm/output_3.out4_12": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 604160 + }, + "/model/layers.6/post_attention_layernorm/output_0.out4_12": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 612352 + }, + "/model/layers.6/mlp/Mul/output_0.out3_6": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 14336 + ], + "size_in_bytes": 28672, + "op_tensor_size": 28672, + "offset": 620544 + }, + "/model/layers.6/mlp/down_proj/MatMulNBits/output_0.out5_4_20": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 649216 + }, + "/model/layers.7/input_layernorm/output_3.out4_13": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 657408 + }, + "/model/layers.7/input_layernorm/output_0.out4_13": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 665600 + }, + "/model/layers.7/attn/qk_proj/MatMulNBits/output_0.out5_4_21": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 5120 + ], + "size_in_bytes": 10240, + "op_tensor_size": 10240, + "offset": 673792 + }, + "/model/layers.7/attn/GroupQueryAttention/output_0.out2_7": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 684032 + }, + "/model/layers.7/attn/o_proj/MatMulNBits/output_0.out5_4_22": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 692224 + }, + "/model/layers.7/post_attention_layernorm/output_3.out4_14": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 700416 + }, + "/model/layers.7/post_attention_layernorm/output_0.out4_14": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 708608 + }, + "/model/layers.7/mlp/Mul/output_0.out3_7": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 14336 + ], + "size_in_bytes": 28672, + "op_tensor_size": 28672, + "offset": 716800 + }, + "/model/layers.7/mlp/down_proj/MatMulNBits/output_0.out5_4_23": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 745472 + }, + "/model/layers.8/input_layernorm/output_3.out4_15": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 753664 + }, + "/model/layers.8/input_layernorm/output_0.out4_15": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 761856 + }, + "/model/layers.8/attn/qk_proj/MatMulNBits/output_0.out5_4_24": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 5120 + ], + "size_in_bytes": 10240, + "op_tensor_size": 10240, + "offset": 770048 + }, + "/model/layers.8/attn/GroupQueryAttention/output_0.out2_8": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 780288 + }, + "/model/layers.8/attn/o_proj/MatMulNBits/output_0.out5_4_25": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 788480 + }, + "/model/layers.8/post_attention_layernorm/output_3.out4_16": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 796672 + }, + "/model/layers.8/post_attention_layernorm/output_0.out4_16": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 804864 + }, + "/model/layers.8/mlp/Mul/output_0.out3_8": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 14336 + ], + "size_in_bytes": 28672, + "op_tensor_size": 28672, + "offset": 813056 + }, + "/model/layers.8/mlp/down_proj/MatMulNBits/output_0.out5_4_26": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 841728 + }, + "/model/layers.9/input_layernorm/output_3.out4_17": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 849920 + }, + "/model/layers.9/input_layernorm/output_0.out4_17": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 858112 + }, + "/model/layers.9/attn/qk_proj/MatMulNBits/output_0.out5_4_27": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 5120 + ], + "size_in_bytes": 10240, + "op_tensor_size": 10240, + "offset": 866304 + }, + "/model/layers.9/attn/GroupQueryAttention/output_0.out2_9": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 876544 + }, + "/model/layers.9/attn/o_proj/MatMulNBits/output_0.out5_4_28": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 884736 + }, + "/model/layers.9/post_attention_layernorm/output_3.out4_18": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 892928 + }, + "/model/layers.9/post_attention_layernorm/output_0.out4_18": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 901120 + }, + "/model/layers.9/mlp/Mul/output_0.out3_9": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 14336 + ], + "size_in_bytes": 28672, + "op_tensor_size": 28672, + "offset": 909312 + }, + "/model/layers.9/mlp/down_proj/MatMulNBits/output_0.out5_4_29": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 937984 + }, + "/model/layers.10/input_layernorm/output_3.out4_19": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 946176 + }, + "/model/layers.10/input_layernorm/output_0.out4_19": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 954368 + }, + "/model/layers.10/attn/qk_proj/MatMulNBits/output_0.out5_4_30": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 5120 + ], + "size_in_bytes": 10240, + "op_tensor_size": 10240, + "offset": 962560 + }, + "/model/layers.10/attn/GroupQueryAttention/output_0.out2_10": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 972800 + }, + "/model/layers.10/attn/o_proj/MatMulNBits/output_0.out5_4_31": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 980992 + }, + "/model/layers.10/post_attention_layernorm/output_3.out4_20": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 989184 + }, + "/model/layers.10/post_attention_layernorm/output_0.out4_20": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 997376 + }, + "/model/layers.10/mlp/Mul/output_0.out3_10": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 14336 + ], + "size_in_bytes": 28672, + "op_tensor_size": 28672, + "offset": 1005568 + }, + "/model/layers.10/mlp/down_proj/MatMulNBits/output_0.out5_4_32": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1034240 + }, + "/model/layers.11/input_layernorm/output_3.out4_21": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1042432 + }, + "/model/layers.11/input_layernorm/output_0.out4_21": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1050624 + }, + "/model/layers.11/attn/qk_proj/MatMulNBits/output_0.out5_4_33": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 5120 + ], + "size_in_bytes": 10240, + "op_tensor_size": 10240, + "offset": 1058816 + }, + "/model/layers.11/attn/GroupQueryAttention/output_0.out2_11": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1069056 + }, + "/model/layers.11/attn/o_proj/MatMulNBits/output_0.out5_4_34": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1077248 + }, + "/model/layers.11/post_attention_layernorm/output_3.out4_22": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1085440 + }, + "/model/layers.11/post_attention_layernorm/output_0.out4_22": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1093632 + }, + "/model/layers.11/mlp/Mul/output_0.out3_11": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 14336 + ], + "size_in_bytes": 28672, + "op_tensor_size": 28672, + "offset": 1101824 + }, + "/model/layers.11/mlp/down_proj/MatMulNBits/output_0.out5_4_35": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1130496 + }, + "/model/layers.12/input_layernorm/output_3.out4_23": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1138688 + }, + "/model/layers.12/input_layernorm/output_0.out4_23": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1146880 + }, + "/model/layers.12/attn/qk_proj/MatMulNBits/output_0.out5_4_36": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 5120 + ], + "size_in_bytes": 10240, + "op_tensor_size": 10240, + "offset": 1155072 + }, + "/model/layers.12/attn/GroupQueryAttention/output_0.out2_12": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1165312 + }, + "/model/layers.12/attn/o_proj/MatMulNBits/output_0.out5_4_37": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1173504 + }, + "/model/layers.12/post_attention_layernorm/output_3.out4_24": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1181696 + }, + "/model/layers.12/post_attention_layernorm/output_0.out4_24": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1189888 + }, + "/model/layers.12/mlp/Mul/output_0.out3_12": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 14336 + ], + "size_in_bytes": 28672, + "op_tensor_size": 28672, + "offset": 1198080 + }, + "/model/layers.12/mlp/down_proj/MatMulNBits/output_0.out5_4_38": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1226752 + }, + "/model/layers.13/input_layernorm/output_3.out4_25": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1234944 + }, + "/model/layers.13/input_layernorm/output_0.out4_25": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1243136 + }, + "/model/layers.13/attn/qk_proj/MatMulNBits/output_0.out5_4_39": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 5120 + ], + "size_in_bytes": 10240, + "op_tensor_size": 10240, + "offset": 1251328 + }, + "/model/layers.13/attn/GroupQueryAttention/output_0.out2_13": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1261568 + }, + "/model/layers.13/attn/o_proj/MatMulNBits/output_0.out5_4_40": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1269760 + }, + "/model/layers.13/post_attention_layernorm/output_3.out4_26": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1277952 + }, + "/model/layers.13/post_attention_layernorm/output_0.out4_26": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1286144 + }, + "/model/layers.13/mlp/Mul/output_0.out3_13": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 14336 + ], + "size_in_bytes": 28672, + "op_tensor_size": 28672, + "offset": 1294336 + }, + "/model/layers.13/mlp/down_proj/MatMulNBits/output_0.out5_4_41": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1323008 + }, + "/model/layers.14/input_layernorm/output_3.out4_27": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1331200 + }, + "/model/layers.14/input_layernorm/output_0.out4_27": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1339392 + }, + "/model/layers.14/attn/qk_proj/MatMulNBits/output_0.out5_4_42": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 5120 + ], + "size_in_bytes": 10240, + "op_tensor_size": 10240, + "offset": 1347584 + }, + "/model/layers.14/attn/GroupQueryAttention/output_0.out2_14": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1357824 + }, + "/model/layers.14/attn/o_proj/MatMulNBits/output_0.out5_4_43": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1366016 + }, + "/model/layers.14/post_attention_layernorm/output_3.out4_28": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1374208 + }, + "/model/layers.14/post_attention_layernorm/output_0.out4_28": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1382400 + }, + "/model/layers.14/mlp/Mul/output_0.out3_14": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 14336 + ], + "size_in_bytes": 28672, + "op_tensor_size": 28672, + "offset": 1390592 + }, + "/model/layers.14/mlp/down_proj/MatMulNBits/output_0.out5_4_44": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1419264 + }, + "/model/layers.15/input_layernorm/output_3.out4_29": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1427456 + }, + "/model/layers.15/input_layernorm/output_0.out4_29": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1435648 + }, + "/model/layers.15/attn/qk_proj/MatMulNBits/output_0.out5_4_45": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 5120 + ], + "size_in_bytes": 10240, + "op_tensor_size": 10240, + "offset": 1443840 + }, + "/model/layers.15/attn/GroupQueryAttention/output_0.out2_15": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1454080 + }, + "/model/layers.15/attn/o_proj/MatMulNBits/output_0.out5_4_46": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1462272 + }, + "/model/layers.15/post_attention_layernorm/output_3.out4_30": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1470464 + }, + "/model/layers.15/post_attention_layernorm/output_0.out4_30": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1478656 + }, + "/model/layers.15/mlp/Mul/output_0.out3_15": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 14336 + ], + "size_in_bytes": 28672, + "op_tensor_size": 28672, + "offset": 1486848 + }, + "/model/layers.15/mlp/down_proj/MatMulNBits/output_0.out5_4_47": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1515520 + }, + "/model/layers.16/input_layernorm/output_3.out4_31": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1523712 + }, + "/model/layers.16/input_layernorm/output_0.out4_31": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1531904 + }, + "/model/layers.16/attn/qk_proj/MatMulNBits/output_0.out5_4_48": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 5120 + ], + "size_in_bytes": 10240, + "op_tensor_size": 10240, + "offset": 1540096 + }, + "/model/layers.16/attn/GroupQueryAttention/output_0.out2_16": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1550336 + }, + "/model/layers.16/attn/o_proj/MatMulNBits/output_0.out5_4_49": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1558528 + }, + "/model/layers.16/post_attention_layernorm/output_3.out4_32": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1566720 + }, + "/model/layers.16/post_attention_layernorm/output_0.out4_32": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1574912 + }, + "/model/layers.16/mlp/Mul/output_0.out3_16": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 14336 + ], + "size_in_bytes": 28672, + "op_tensor_size": 28672, + "offset": 1583104 + }, + "/model/layers.16/mlp/down_proj/MatMulNBits/output_0.out5_4_50": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1611776 + }, + "/model/layers.17/input_layernorm/output_3.out4_33": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1619968 + }, + "/model/layers.17/input_layernorm/output_0.out4_33": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1628160 + }, + "/model/layers.17/attn/qk_proj/MatMulNBits/output_0.out5_4_51": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 5120 + ], + "size_in_bytes": 10240, + "op_tensor_size": 10240, + "offset": 1636352 + }, + "/model/layers.17/attn/GroupQueryAttention/output_0.out2_17": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1646592 + }, + "/model/layers.17/attn/o_proj/MatMulNBits/output_0.out5_4_52": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1654784 + }, + "/model/layers.17/post_attention_layernorm/output_3.out4_34": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1662976 + }, + "/model/layers.17/post_attention_layernorm/output_0.out4_34": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1671168 + }, + "/model/layers.17/mlp/Mul/output_0.out3_17": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 14336 + ], + "size_in_bytes": 28672, + "op_tensor_size": 28672, + "offset": 1679360 + }, + "/model/layers.17/mlp/down_proj/MatMulNBits/output_0.out5_4_53": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1708032 + }, + "/model/layers.18/input_layernorm/output_3.out4_35": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1716224 + }, + "/model/layers.18/input_layernorm/output_0.out4_35": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1724416 + }, + "/model/layers.18/attn/qk_proj/MatMulNBits/output_0.out5_4_54": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 5120 + ], + "size_in_bytes": 10240, + "op_tensor_size": 10240, + "offset": 1732608 + }, + "/model/layers.18/attn/GroupQueryAttention/output_0.out2_18": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1742848 + }, + "/model/layers.18/attn/o_proj/MatMulNBits/output_0.out5_4_55": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1751040 + }, + "/model/layers.18/post_attention_layernorm/output_3.out4_36": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1759232 + }, + "/model/layers.18/post_attention_layernorm/output_0.out4_36": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1767424 + }, + "/model/layers.18/mlp/Mul/output_0.out3_18": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 14336 + ], + "size_in_bytes": 28672, + "op_tensor_size": 28672, + "offset": 1775616 + }, + "/model/layers.18/mlp/down_proj/MatMulNBits/output_0.out5_4_56": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1804288 + }, + "/model/layers.19/input_layernorm/output_3.out4_37": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1812480 + }, + "/model/layers.19/input_layernorm/output_0.out4_37": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1820672 + }, + "/model/layers.19/attn/qk_proj/MatMulNBits/output_0.out5_4_57": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 5120 + ], + "size_in_bytes": 10240, + "op_tensor_size": 10240, + "offset": 1828864 + }, + "/model/layers.19/attn/GroupQueryAttention/output_0.out2_19": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1839104 + }, + "/model/layers.19/attn/o_proj/MatMulNBits/output_0.out5_4_58": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1847296 + }, + "/model/layers.19/post_attention_layernorm/output_3.out4_38": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1855488 + }, + "/model/layers.19/post_attention_layernorm/output_0.out4_38": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1863680 + }, + "/model/layers.19/mlp/Mul/output_0.out3_19": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 14336 + ], + "size_in_bytes": 28672, + "op_tensor_size": 28672, + "offset": 1871872 + }, + "/model/layers.19/mlp/down_proj/MatMulNBits/output_0.out5_4_59": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1900544 + }, + "/model/layers.20/input_layernorm/output_3.out4_39": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1908736 + }, + "/model/layers.20/input_layernorm/output_0.out4_39": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1916928 + }, + "/model/layers.20/attn/qk_proj/MatMulNBits/output_0.out5_4_60": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 5120 + ], + "size_in_bytes": 10240, + "op_tensor_size": 10240, + "offset": 1925120 + }, + "/model/layers.20/attn/GroupQueryAttention/output_0.out2_20": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1935360 + }, + "/model/layers.20/attn/o_proj/MatMulNBits/output_0.out5_4_61": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1943552 + }, + "/model/layers.20/post_attention_layernorm/output_3.out4_40": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1951744 + }, + "/model/layers.20/post_attention_layernorm/output_0.out4_40": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1959936 + }, + "/model/layers.20/mlp/Mul/output_0.out3_20": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 14336 + ], + "size_in_bytes": 28672, + "op_tensor_size": 28672, + "offset": 1968128 + }, + "/model/layers.20/mlp/down_proj/MatMulNBits/output_0.out5_4_62": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1996800 + }, + "/model/layers.21/input_layernorm/output_3.out4_41": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2004992 + }, + "/model/layers.21/input_layernorm/output_0.out4_41": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2013184 + }, + "/model/layers.21/attn/qk_proj/MatMulNBits/output_0.out5_4_63": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 5120 + ], + "size_in_bytes": 10240, + "op_tensor_size": 10240, + "offset": 2021376 + }, + "/model/layers.21/attn/GroupQueryAttention/output_0.out2_21": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2031616 + }, + "/model/layers.21/attn/o_proj/MatMulNBits/output_0.out5_4_64": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2039808 + }, + "/model/layers.21/post_attention_layernorm/output_3.out4_42": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2048000 + }, + "/model/layers.21/post_attention_layernorm/output_0.out4_42": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2056192 + }, + "/model/layers.21/mlp/Mul/output_0.out3_21": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 14336 + ], + "size_in_bytes": 28672, + "op_tensor_size": 28672, + "offset": 2064384 + }, + "/model/layers.21/mlp/down_proj/MatMulNBits/output_0.out5_4_65": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2093056 + }, + "/model/layers.22/input_layernorm/output_3.out4_43": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2101248 + }, + "/model/layers.22/input_layernorm/output_0.out4_43": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2109440 + }, + "/model/layers.22/attn/qk_proj/MatMulNBits/output_0.out5_4_66": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 5120 + ], + "size_in_bytes": 10240, + "op_tensor_size": 10240, + "offset": 2117632 + }, + "/model/layers.22/attn/GroupQueryAttention/output_0.out2_22": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2127872 + }, + "/model/layers.22/attn/o_proj/MatMulNBits/output_0.out5_4_67": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2136064 + }, + "/model/layers.22/post_attention_layernorm/output_3.out4_44": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2144256 + }, + "/model/layers.22/post_attention_layernorm/output_0.out4_44": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2152448 + }, + "/model/layers.22/mlp/Mul/output_0.out3_22": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 14336 + ], + "size_in_bytes": 28672, + "op_tensor_size": 28672, + "offset": 2160640 + }, + "/model/layers.22/mlp/down_proj/MatMulNBits/output_0.out5_4_68": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2189312 + }, + "/model/layers.23/input_layernorm/output_3.out4_45": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2197504 + }, + "/model/layers.23/input_layernorm/output_0.out4_45": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2205696 + }, + "/model/layers.23/attn/qk_proj/MatMulNBits/output_0.out5_4_69": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 5120 + ], + "size_in_bytes": 10240, + "op_tensor_size": 10240, + "offset": 2213888 + }, + "/model/layers.23/attn/GroupQueryAttention/output_0.out2_23": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2224128 + }, + "/model/layers.23/attn/o_proj/MatMulNBits/output_0.out5_4_70": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2232320 + }, + "/model/layers.23/post_attention_layernorm/output_3.out4_46": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2240512 + }, + "/model/layers.23/post_attention_layernorm/output_0.out4_46": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2248704 + }, + "/model/layers.23/mlp/Mul/output_0.out3_23": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 14336 + ], + "size_in_bytes": 28672, + "op_tensor_size": 28672, + "offset": 2256896 + }, + "/model/layers.23/mlp/down_proj/MatMulNBits/output_0.out5_4_71": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2285568 + }, + "/model/layers.24/input_layernorm/output_3.out4_47": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2293760 + }, + "/model/layers.24/input_layernorm/output_0.out4_47": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2301952 + }, + "/model/layers.24/attn/qk_proj/MatMulNBits/output_0.out5_4_72": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 5120 + ], + "size_in_bytes": 10240, + "op_tensor_size": 10240, + "offset": 2310144 + }, + "/model/layers.24/attn/GroupQueryAttention/output_0.out2_24": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2320384 + }, + "/model/layers.24/attn/o_proj/MatMulNBits/output_0.out5_4_73": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2328576 + }, + "/model/layers.24/post_attention_layernorm/output_3.out4_48": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2336768 + }, + "/model/layers.24/post_attention_layernorm/output_0.out4_48": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2344960 + }, + "/model/layers.24/mlp/Mul/output_0.out3_24": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 14336 + ], + "size_in_bytes": 28672, + "op_tensor_size": 28672, + "offset": 2353152 + }, + "/model/layers.24/mlp/down_proj/MatMulNBits/output_0.out5_4_74": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2381824 + }, + "/model/layers.25/input_layernorm/output_3.out4_49": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2390016 + }, + "/model/layers.25/input_layernorm/output_0.out4_49": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2398208 + }, + "/model/layers.25/attn/qk_proj/MatMulNBits/output_0.out5_4_75": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 5120 + ], + "size_in_bytes": 10240, + "op_tensor_size": 10240, + "offset": 2406400 + }, + "/model/layers.25/attn/GroupQueryAttention/output_0.out2_25": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2416640 + }, + "/model/layers.25/attn/o_proj/MatMulNBits/output_0.out5_4_76": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2424832 + }, + "/model/layers.25/post_attention_layernorm/output_3.out4_50": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2433024 + }, + "/model/layers.25/post_attention_layernorm/output_0.out4_50": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2441216 + }, + "/model/layers.25/mlp/Mul/output_0.out3_25": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 14336 + ], + "size_in_bytes": 28672, + "op_tensor_size": 28672, + "offset": 2449408 + }, + "/model/layers.25/mlp/down_proj/MatMulNBits/output_0.out5_4_77": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2478080 + }, + "/model/layers.26/input_layernorm/output_3.out4_51": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2486272 + }, + "/model/layers.26/input_layernorm/output_0.out4_51": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2494464 + }, + "/model/layers.26/attn/qk_proj/MatMulNBits/output_0.out5_4_78": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 5120 + ], + "size_in_bytes": 10240, + "op_tensor_size": 10240, + "offset": 2502656 + }, + "/model/layers.26/attn/GroupQueryAttention/output_0.out2_26": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2512896 + }, + "/model/layers.26/attn/o_proj/MatMulNBits/output_0.out5_4_79": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2521088 + }, + "/model/layers.26/post_attention_layernorm/output_3.out4_52": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2529280 + }, + "/model/layers.26/post_attention_layernorm/output_0.out4_52": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2537472 + }, + "/model/layers.26/mlp/Mul/output_0.out3_26": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 14336 + ], + "size_in_bytes": 28672, + "op_tensor_size": 28672, + "offset": 2545664 + }, + "/model/layers.26/mlp/down_proj/MatMulNBits/output_0.out5_4_80": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2574336 + }, + "/model/layers.27/input_layernorm/output_3.out4_53": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2582528 + }, + "/model/layers.27/input_layernorm/output_0.out4_53": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2590720 + }, + "/model/layers.27/attn/qk_proj/MatMulNBits/output_0.out5_4_81": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 5120 + ], + "size_in_bytes": 10240, + "op_tensor_size": 10240, + "offset": 2598912 + }, + "/model/layers.27/attn/GroupQueryAttention/output_0.out2_27": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2609152 + }, + "/model/layers.27/attn/o_proj/MatMulNBits/output_0.out5_4_82": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2617344 + }, + "/model/layers.27/post_attention_layernorm/output_3.out4_54": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2625536 + }, + "/model/layers.27/post_attention_layernorm/output_0.out4_54": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2633728 + }, + "/model/layers.27/mlp/Mul/output_0.out3_27": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 14336 + ], + "size_in_bytes": 28672, + "op_tensor_size": 28672, + "offset": 2641920 + }, + "/model/layers.27/mlp/down_proj/MatMulNBits/output_0.out5_4_83": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2670592 + }, + "/model/layers.28/input_layernorm/output_3.out4_55": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2678784 + }, + "/model/layers.28/input_layernorm/output_0.out4_55": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2686976 + }, + "/model/layers.28/attn/qk_proj/MatMulNBits/output_0.out5_4_84": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 5120 + ], + "size_in_bytes": 10240, + "op_tensor_size": 10240, + "offset": 2695168 + }, + "/model/layers.28/attn/GroupQueryAttention/output_0.out2_28": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2705408 + }, + "/model/layers.28/attn/o_proj/MatMulNBits/output_0.out5_4_85": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2713600 + }, + "/model/layers.28/post_attention_layernorm/output_3.out4_56": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2721792 + }, + "/model/layers.28/post_attention_layernorm/output_0.out4_56": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2729984 + }, + "/model/layers.28/mlp/Mul/output_0.out3_28": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 14336 + ], + "size_in_bytes": 28672, + "op_tensor_size": 28672, + "offset": 2738176 + }, + "/model/layers.28/mlp/down_proj/MatMulNBits/output_0.out5_4_86": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2766848 + }, + "/model/layers.29/input_layernorm/output_3.out4_57": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2775040 + }, + "/model/layers.29/input_layernorm/output_0.out4_57": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2783232 + }, + "/model/layers.29/attn/qk_proj/MatMulNBits/output_0.out5_4_87": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 5120 + ], + "size_in_bytes": 10240, + "op_tensor_size": 10240, + "offset": 2791424 + }, + "/model/layers.29/attn/GroupQueryAttention/output_0.out2_29": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2801664 + }, + "/model/layers.29/attn/o_proj/MatMulNBits/output_0.out5_4_88": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2809856 + }, + "/model/layers.29/post_attention_layernorm/output_3.out4_58": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2818048 + }, + "/model/layers.29/post_attention_layernorm/output_0.out4_58": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2826240 + }, + "/model/layers.29/mlp/Mul/output_0.out3_29": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 14336 + ], + "size_in_bytes": 28672, + "op_tensor_size": 28672, + "offset": 2834432 + }, + "/model/layers.29/mlp/down_proj/MatMulNBits/output_0.out5_4_89": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2863104 + }, + "/model/layers.30/input_layernorm/output_3.out4_59": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2871296 + }, + "/model/layers.30/input_layernorm/output_0.out4_59": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2879488 + }, + "/model/layers.30/attn/qk_proj/MatMulNBits/output_0.out5_4_90": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 5120 + ], + "size_in_bytes": 10240, + "op_tensor_size": 10240, + "offset": 2887680 + }, + "/model/layers.30/attn/GroupQueryAttention/output_0.out2_30": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2897920 + }, + "/model/layers.30/attn/o_proj/MatMulNBits/output_0.out5_4_91": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2906112 + }, + "/model/layers.30/post_attention_layernorm/output_3.out4_60": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2914304 + }, + "/model/layers.30/post_attention_layernorm/output_0.out4_60": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2922496 + }, + "/model/layers.30/mlp/Mul/output_0.out3_30": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 14336 + ], + "size_in_bytes": 28672, + "op_tensor_size": 28672, + "offset": 2930688 + }, + "/model/layers.30/mlp/down_proj/MatMulNBits/output_0.out5_4_92": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2959360 + }, + "/model/layers.31/input_layernorm/output_3.out4_61": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2967552 + }, + "/model/layers.31/input_layernorm/output_0.out4_61": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2975744 + }, + "/model/layers.31/attn/qk_proj/MatMulNBits/output_0.out5_4_93": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 5120 + ], + "size_in_bytes": 10240, + "op_tensor_size": 10240, + "offset": 2983936 + }, + "/model/layers.31/attn/GroupQueryAttention/output_0.out2_31": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2994176 + }, + "/model/layers.31/attn/o_proj/MatMulNBits/output_0.out5_4_94": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 3002368 + }, + "/model/layers.31/post_attention_layernorm/output_3.out4_62": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 3010560 + }, + "/model/layers.31/post_attention_layernorm/output_0.out4_62": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 3018752 + }, + "/model/layers.31/mlp/Mul/output_0.out3_31": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 14336 + ], + "size_in_bytes": 28672, + "op_tensor_size": 28672, + "offset": 3026944 + }, + "/model/layers.31/mlp/down_proj/MatMulNBits/output_0.out5_4_95": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 3055616 + }, + "/model/layers.32/final_norm_layernorm/output_0.out4_63": { + "packed_buffer_label": "scratch", + "xrt_arg_id": 2, + "dtype": "bfloat16", + "shape": [ + 1, + 1, + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 3063808 + }, + "model.layers.0.attn.qk_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 5120 + ], + "size_in_bytes": 20971520, + "op_tensor_size": 20971520, + "offset": 0, + "file_name": ".cache\\MatMulNBits_2_0_0.const", + "file_size": 20971520 + }, + "model.layers.0.attn.qk_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 5120 + ], + "size_in_bytes": 20480, + "op_tensor_size": 20480, + "offset": 20971520, + "file_name": ".cache\\MatMulNBits_2_0_1.const", + "file_size": 20480 + }, + "model.layers.0.attn.qk_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 163840 + ], + "size_in_bytes": 655360, + "op_tensor_size": 655360, + "offset": 20992000, + "file_name": ".cache\\MatMulNBits_2_0_2.const", + "file_size": 655360 + }, + "model.layers.0.attn.qk_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 163840 + ], + "size_in_bytes": 163840, + "op_tensor_size": 163840, + "offset": 21647360, + "file_name": ".cache\\MatMulNBits_2_0_3.const", + "file_size": 163840 + }, + "model.layers.0.attn.v_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 1024 + ], + "size_in_bytes": 4194304, + "op_tensor_size": 4194304, + "offset": 21811200, + "file_name": ".cache\\MatMulNBits_2_0_4.const", + "file_size": 4194304 + }, + "model.layers.0.attn.v_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 1024 + ], + "size_in_bytes": 4096, + "op_tensor_size": 4096, + "offset": 26005504, + "file_name": ".cache\\MatMulNBits_2_0_5.const", + "file_size": 4096 + }, + "model.layers.0.attn.v_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 32768 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 26009600, + "file_name": ".cache\\MatMulNBits_2_0_6.const", + "file_size": 131072 + }, + "model.layers.0.attn.v_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 32768 + ], + "size_in_bytes": 32768, + "op_tensor_size": 32768, + "offset": 26140672, + "file_name": ".cache\\MatMulNBits_2_0_7.const", + "file_size": 32768 + }, + "model.layers.0.attn.o_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 4096 + ], + "size_in_bytes": 16777216, + "op_tensor_size": 16777216, + "offset": 26173440, + "file_name": ".cache\\MatMulNBits_2_0_8.const", + "file_size": 16777216 + }, + "model.layers.0.attn.o_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 42950656, + "file_name": ".cache\\MatMulNBits_2_0_9.const", + "file_size": 16384 + }, + "model.layers.0.attn.o_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 131072 + ], + "size_in_bytes": 524288, + "op_tensor_size": 524288, + "offset": 42967040, + "file_name": ".cache\\MatMulNBits_2_0_10.const", + "file_size": 524288 + }, + "model.layers.0.attn.o_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 131072 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 43491328, + "file_name": ".cache\\MatMulNBits_2_0_11.const", + "file_size": 131072 + }, + "model.layers.0.post_attention_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 43622400, + "file_name": ".cache\\MatMulNBits_2_0_12.const", + "file_size": 8192 + }, + "model.layers.0.mlp.gate_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 43630592, + "file_name": ".cache\\MatMulNBits_2_0_13.const", + "file_size": 29360128 + }, + "model.layers.0.mlp.gate_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 72990720, + "file_name": ".cache\\MatMulNBits_2_0_14.const", + "file_size": 1835008 + }, + "model.layers.0.mlp.gate_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 74825728, + "file_name": ".cache\\MatMulNBits_2_0_15.const", + "file_size": 229376 + }, + "model.layers.0.mlp.gate_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 75055104, + "file_name": ".cache\\MatMulNBits_2_0_16.const", + "file_size": 57344 + }, + "model.layers.0.mlp.up_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 75112448, + "file_name": ".cache\\MatMulNBits_2_0_17.const", + "file_size": 29360128 + }, + "model.layers.0.mlp.up_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 104472576, + "file_name": ".cache\\MatMulNBits_2_0_18.const", + "file_size": 1835008 + }, + "model.layers.0.mlp.up_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 106307584, + "file_name": ".cache\\MatMulNBits_2_0_19.const", + "file_size": 229376 + }, + "model.layers.0.mlp.up_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 106536960, + "file_name": ".cache\\MatMulNBits_2_0_20.const", + "file_size": 57344 + }, + "model.layers.0.mlp.down_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 14336, + 4096 + ], + "size_in_bytes": 58720256, + "op_tensor_size": 58720256, + "offset": 106594304, + "file_name": ".cache\\MatMulNBits_2_0_21.const", + "file_size": 58720256 + }, + "model.layers.0.mlp.down_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 165314560, + "file_name": ".cache\\MatMulNBits_2_0_22.const", + "file_size": 16384 + }, + "model.layers.0.mlp.down_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 165330944, + "file_name": ".cache\\MatMulNBits_2_0_23.const", + "file_size": 1835008 + }, + "model.layers.0.mlp.down_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 458752 + ], + "size_in_bytes": 458752, + "op_tensor_size": 458752, + "offset": 167165952, + "file_name": ".cache\\MatMulNBits_2_0_24.const", + "file_size": 458752 + }, + "model.layers.1.input_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 167624704, + "file_name": ".cache\\MatMulNBits_2_0_25.const", + "file_size": 8192 + }, + "model.layers.1.attn.qk_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 5120 + ], + "size_in_bytes": 20971520, + "op_tensor_size": 20971520, + "offset": 167632896, + "file_name": ".cache\\MatMulNBits_2_0_26.const", + "file_size": 20971520 + }, + "model.layers.1.attn.qk_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 5120 + ], + "size_in_bytes": 20480, + "op_tensor_size": 20480, + "offset": 188604416, + "file_name": ".cache\\MatMulNBits_2_0_27.const", + "file_size": 20480 + }, + "model.layers.1.attn.qk_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 163840 + ], + "size_in_bytes": 655360, + "op_tensor_size": 655360, + "offset": 188624896, + "file_name": ".cache\\MatMulNBits_2_0_28.const", + "file_size": 655360 + }, + "model.layers.1.attn.qk_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 163840 + ], + "size_in_bytes": 163840, + "op_tensor_size": 163840, + "offset": 189280256, + "file_name": ".cache\\MatMulNBits_2_0_29.const", + "file_size": 163840 + }, + "model.layers.1.attn.v_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 1024 + ], + "size_in_bytes": 4194304, + "op_tensor_size": 4194304, + "offset": 189444096, + "file_name": ".cache\\MatMulNBits_2_0_30.const", + "file_size": 4194304 + }, + "model.layers.1.attn.v_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 1024 + ], + "size_in_bytes": 4096, + "op_tensor_size": 4096, + "offset": 193638400, + "file_name": ".cache\\MatMulNBits_2_0_31.const", + "file_size": 4096 + }, + "model.layers.1.attn.v_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 32768 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 193642496, + "file_name": ".cache\\MatMulNBits_2_0_32.const", + "file_size": 131072 + }, + "model.layers.1.attn.v_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 32768 + ], + "size_in_bytes": 32768, + "op_tensor_size": 32768, + "offset": 193773568, + "file_name": ".cache\\MatMulNBits_2_0_33.const", + "file_size": 32768 + }, + "model.layers.1.attn.o_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 4096 + ], + "size_in_bytes": 16777216, + "op_tensor_size": 16777216, + "offset": 193806336, + "file_name": ".cache\\MatMulNBits_2_0_34.const", + "file_size": 16777216 + }, + "model.layers.1.attn.o_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 210583552, + "file_name": ".cache\\MatMulNBits_2_0_35.const", + "file_size": 16384 + }, + "model.layers.1.attn.o_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 131072 + ], + "size_in_bytes": 524288, + "op_tensor_size": 524288, + "offset": 210599936, + "file_name": ".cache\\MatMulNBits_2_0_36.const", + "file_size": 524288 + }, + "model.layers.1.attn.o_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 131072 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 211124224, + "file_name": ".cache\\MatMulNBits_2_0_37.const", + "file_size": 131072 + }, + "model.layers.1.post_attention_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 211255296, + "file_name": ".cache\\MatMulNBits_2_0_38.const", + "file_size": 8192 + }, + "model.layers.1.mlp.gate_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 211263488, + "file_name": ".cache\\MatMulNBits_2_0_39.const", + "file_size": 29360128 + }, + "model.layers.1.mlp.gate_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 240623616, + "file_name": ".cache\\MatMulNBits_2_0_40.const", + "file_size": 1835008 + }, + "model.layers.1.mlp.gate_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 242458624, + "file_name": ".cache\\MatMulNBits_2_0_41.const", + "file_size": 229376 + }, + "model.layers.1.mlp.gate_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 242688000, + "file_name": ".cache\\MatMulNBits_2_0_42.const", + "file_size": 57344 + }, + "model.layers.1.mlp.up_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 242745344, + "file_name": ".cache\\MatMulNBits_2_0_43.const", + "file_size": 29360128 + }, + "model.layers.1.mlp.up_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 272105472, + "file_name": ".cache\\MatMulNBits_2_0_44.const", + "file_size": 1835008 + }, + "model.layers.1.mlp.up_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 273940480, + "file_name": ".cache\\MatMulNBits_2_0_45.const", + "file_size": 229376 + }, + "model.layers.1.mlp.up_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 274169856, + "file_name": ".cache\\MatMulNBits_2_0_46.const", + "file_size": 57344 + }, + "model.layers.1.mlp.down_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 14336, + 4096 + ], + "size_in_bytes": 58720256, + "op_tensor_size": 58720256, + "offset": 274227200, + "file_name": ".cache\\MatMulNBits_2_0_47.const", + "file_size": 58720256 + }, + "model.layers.1.mlp.down_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 332947456, + "file_name": ".cache\\MatMulNBits_2_0_48.const", + "file_size": 16384 + }, + "model.layers.1.mlp.down_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 332963840, + "file_name": ".cache\\MatMulNBits_2_0_49.const", + "file_size": 1835008 + }, + "model.layers.1.mlp.down_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 458752 + ], + "size_in_bytes": 458752, + "op_tensor_size": 458752, + "offset": 334798848, + "file_name": ".cache\\MatMulNBits_2_0_50.const", + "file_size": 458752 + }, + "model.layers.2.input_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 335257600, + "file_name": ".cache\\MatMulNBits_2_0_51.const", + "file_size": 8192 + }, + "model.layers.2.attn.qk_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 5120 + ], + "size_in_bytes": 20971520, + "op_tensor_size": 20971520, + "offset": 335265792, + "file_name": ".cache\\MatMulNBits_2_0_52.const", + "file_size": 20971520 + }, + "model.layers.2.attn.qk_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 5120 + ], + "size_in_bytes": 20480, + "op_tensor_size": 20480, + "offset": 356237312, + "file_name": ".cache\\MatMulNBits_2_0_53.const", + "file_size": 20480 + }, + "model.layers.2.attn.qk_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 163840 + ], + "size_in_bytes": 655360, + "op_tensor_size": 655360, + "offset": 356257792, + "file_name": ".cache\\MatMulNBits_2_0_54.const", + "file_size": 655360 + }, + "model.layers.2.attn.qk_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 163840 + ], + "size_in_bytes": 163840, + "op_tensor_size": 163840, + "offset": 356913152, + "file_name": ".cache\\MatMulNBits_2_0_55.const", + "file_size": 163840 + }, + "model.layers.2.attn.v_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 1024 + ], + "size_in_bytes": 4194304, + "op_tensor_size": 4194304, + "offset": 357076992, + "file_name": ".cache\\MatMulNBits_2_0_56.const", + "file_size": 4194304 + }, + "model.layers.2.attn.v_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 1024 + ], + "size_in_bytes": 4096, + "op_tensor_size": 4096, + "offset": 361271296, + "file_name": ".cache\\MatMulNBits_2_0_57.const", + "file_size": 4096 + }, + "model.layers.2.attn.v_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 32768 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 361275392, + "file_name": ".cache\\MatMulNBits_2_0_58.const", + "file_size": 131072 + }, + "model.layers.2.attn.v_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 32768 + ], + "size_in_bytes": 32768, + "op_tensor_size": 32768, + "offset": 361406464, + "file_name": ".cache\\MatMulNBits_2_0_59.const", + "file_size": 32768 + }, + "model.layers.2.attn.o_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 4096 + ], + "size_in_bytes": 16777216, + "op_tensor_size": 16777216, + "offset": 361439232, + "file_name": ".cache\\MatMulNBits_2_0_60.const", + "file_size": 16777216 + }, + "model.layers.2.attn.o_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 378216448, + "file_name": ".cache\\MatMulNBits_2_0_61.const", + "file_size": 16384 + }, + "model.layers.2.attn.o_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 131072 + ], + "size_in_bytes": 524288, + "op_tensor_size": 524288, + "offset": 378232832, + "file_name": ".cache\\MatMulNBits_2_0_62.const", + "file_size": 524288 + }, + "model.layers.2.attn.o_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 131072 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 378757120, + "file_name": ".cache\\MatMulNBits_2_0_63.const", + "file_size": 131072 + }, + "model.layers.2.post_attention_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 378888192, + "file_name": ".cache\\MatMulNBits_2_0_64.const", + "file_size": 8192 + }, + "model.layers.2.mlp.gate_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 378896384, + "file_name": ".cache\\MatMulNBits_2_0_65.const", + "file_size": 29360128 + }, + "model.layers.2.mlp.gate_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 408256512, + "file_name": ".cache\\MatMulNBits_2_0_66.const", + "file_size": 1835008 + }, + "model.layers.2.mlp.gate_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 410091520, + "file_name": ".cache\\MatMulNBits_2_0_67.const", + "file_size": 229376 + }, + "model.layers.2.mlp.gate_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 410320896, + "file_name": ".cache\\MatMulNBits_2_0_68.const", + "file_size": 57344 + }, + "model.layers.2.mlp.up_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 410378240, + "file_name": ".cache\\MatMulNBits_2_0_69.const", + "file_size": 29360128 + }, + "model.layers.2.mlp.up_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 439738368, + "file_name": ".cache\\MatMulNBits_2_0_70.const", + "file_size": 1835008 + }, + "model.layers.2.mlp.up_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 441573376, + "file_name": ".cache\\MatMulNBits_2_0_71.const", + "file_size": 229376 + }, + "model.layers.2.mlp.up_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 441802752, + "file_name": ".cache\\MatMulNBits_2_0_72.const", + "file_size": 57344 + }, + "model.layers.2.mlp.down_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 14336, + 4096 + ], + "size_in_bytes": 58720256, + "op_tensor_size": 58720256, + "offset": 441860096, + "file_name": ".cache\\MatMulNBits_2_0_73.const", + "file_size": 58720256 + }, + "model.layers.2.mlp.down_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 500580352, + "file_name": ".cache\\MatMulNBits_2_0_74.const", + "file_size": 16384 + }, + "model.layers.2.mlp.down_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 500596736, + "file_name": ".cache\\MatMulNBits_2_0_75.const", + "file_size": 1835008 + }, + "model.layers.2.mlp.down_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 458752 + ], + "size_in_bytes": 458752, + "op_tensor_size": 458752, + "offset": 502431744, + "file_name": ".cache\\MatMulNBits_2_0_76.const", + "file_size": 458752 + }, + "model.layers.3.input_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 502890496, + "file_name": ".cache\\MatMulNBits_2_0_77.const", + "file_size": 8192 + }, + "model.layers.3.attn.qk_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 5120 + ], + "size_in_bytes": 20971520, + "op_tensor_size": 20971520, + "offset": 502898688, + "file_name": ".cache\\MatMulNBits_2_0_78.const", + "file_size": 20971520 + }, + "model.layers.3.attn.qk_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 5120 + ], + "size_in_bytes": 20480, + "op_tensor_size": 20480, + "offset": 523870208, + "file_name": ".cache\\MatMulNBits_2_0_79.const", + "file_size": 20480 + }, + "model.layers.3.attn.qk_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 163840 + ], + "size_in_bytes": 655360, + "op_tensor_size": 655360, + "offset": 523890688, + "file_name": ".cache\\MatMulNBits_2_0_80.const", + "file_size": 655360 + }, + "model.layers.3.attn.qk_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 163840 + ], + "size_in_bytes": 163840, + "op_tensor_size": 163840, + "offset": 524546048, + "file_name": ".cache\\MatMulNBits_2_0_81.const", + "file_size": 163840 + }, + "model.layers.3.attn.v_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 1024 + ], + "size_in_bytes": 4194304, + "op_tensor_size": 4194304, + "offset": 524709888, + "file_name": ".cache\\MatMulNBits_2_0_82.const", + "file_size": 4194304 + }, + "model.layers.3.attn.v_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 1024 + ], + "size_in_bytes": 4096, + "op_tensor_size": 4096, + "offset": 528904192, + "file_name": ".cache\\MatMulNBits_2_0_83.const", + "file_size": 4096 + }, + "model.layers.3.attn.v_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 32768 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 528908288, + "file_name": ".cache\\MatMulNBits_2_0_84.const", + "file_size": 131072 + }, + "model.layers.3.attn.v_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 32768 + ], + "size_in_bytes": 32768, + "op_tensor_size": 32768, + "offset": 529039360, + "file_name": ".cache\\MatMulNBits_2_0_85.const", + "file_size": 32768 + }, + "model.layers.3.attn.o_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 4096 + ], + "size_in_bytes": 16777216, + "op_tensor_size": 16777216, + "offset": 529072128, + "file_name": ".cache\\MatMulNBits_2_0_86.const", + "file_size": 16777216 + }, + "model.layers.3.attn.o_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 545849344, + "file_name": ".cache\\MatMulNBits_2_0_87.const", + "file_size": 16384 + }, + "model.layers.3.attn.o_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 131072 + ], + "size_in_bytes": 524288, + "op_tensor_size": 524288, + "offset": 545865728, + "file_name": ".cache\\MatMulNBits_2_0_88.const", + "file_size": 524288 + }, + "model.layers.3.attn.o_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 131072 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 546390016, + "file_name": ".cache\\MatMulNBits_2_0_89.const", + "file_size": 131072 + }, + "model.layers.3.post_attention_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 546521088, + "file_name": ".cache\\MatMulNBits_2_0_90.const", + "file_size": 8192 + }, + "model.layers.3.mlp.gate_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 546529280, + "file_name": ".cache\\MatMulNBits_2_0_91.const", + "file_size": 29360128 + }, + "model.layers.3.mlp.gate_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 575889408, + "file_name": ".cache\\MatMulNBits_2_0_92.const", + "file_size": 1835008 + }, + "model.layers.3.mlp.gate_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 577724416, + "file_name": ".cache\\MatMulNBits_2_0_93.const", + "file_size": 229376 + }, + "model.layers.3.mlp.gate_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 577953792, + "file_name": ".cache\\MatMulNBits_2_0_94.const", + "file_size": 57344 + }, + "model.layers.3.mlp.up_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 578011136, + "file_name": ".cache\\MatMulNBits_2_0_95.const", + "file_size": 29360128 + }, + "model.layers.3.mlp.up_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 607371264, + "file_name": ".cache\\MatMulNBits_2_0_96.const", + "file_size": 1835008 + }, + "model.layers.3.mlp.up_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 609206272, + "file_name": ".cache\\MatMulNBits_2_0_97.const", + "file_size": 229376 + }, + "model.layers.3.mlp.up_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 609435648, + "file_name": ".cache\\MatMulNBits_2_0_98.const", + "file_size": 57344 + }, + "model.layers.3.mlp.down_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 14336, + 4096 + ], + "size_in_bytes": 58720256, + "op_tensor_size": 58720256, + "offset": 609492992, + "file_name": ".cache\\MatMulNBits_2_0_99.const", + "file_size": 58720256 + }, + "model.layers.3.mlp.down_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 668213248, + "file_name": ".cache\\MatMulNBits_2_0_100.const", + "file_size": 16384 + }, + "model.layers.3.mlp.down_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 668229632, + "file_name": ".cache\\MatMulNBits_2_0_101.const", + "file_size": 1835008 + }, + "model.layers.3.mlp.down_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 458752 + ], + "size_in_bytes": 458752, + "op_tensor_size": 458752, + "offset": 670064640, + "file_name": ".cache\\MatMulNBits_2_0_102.const", + "file_size": 458752 + }, + "model.layers.4.input_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 670523392, + "file_name": ".cache\\MatMulNBits_2_0_103.const", + "file_size": 8192 + }, + "model.layers.4.attn.qk_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 5120 + ], + "size_in_bytes": 20971520, + "op_tensor_size": 20971520, + "offset": 670531584, + "file_name": ".cache\\MatMulNBits_2_0_104.const", + "file_size": 20971520 + }, + "model.layers.4.attn.qk_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 5120 + ], + "size_in_bytes": 20480, + "op_tensor_size": 20480, + "offset": 691503104, + "file_name": ".cache\\MatMulNBits_2_0_105.const", + "file_size": 20480 + }, + "model.layers.4.attn.qk_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 163840 + ], + "size_in_bytes": 655360, + "op_tensor_size": 655360, + "offset": 691523584, + "file_name": ".cache\\MatMulNBits_2_0_106.const", + "file_size": 655360 + }, + "model.layers.4.attn.qk_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 163840 + ], + "size_in_bytes": 163840, + "op_tensor_size": 163840, + "offset": 692178944, + "file_name": ".cache\\MatMulNBits_2_0_107.const", + "file_size": 163840 + }, + "model.layers.4.attn.v_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 1024 + ], + "size_in_bytes": 4194304, + "op_tensor_size": 4194304, + "offset": 692342784, + "file_name": ".cache\\MatMulNBits_2_0_108.const", + "file_size": 4194304 + }, + "model.layers.4.attn.v_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 1024 + ], + "size_in_bytes": 4096, + "op_tensor_size": 4096, + "offset": 696537088, + "file_name": ".cache\\MatMulNBits_2_0_109.const", + "file_size": 4096 + }, + "model.layers.4.attn.v_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 32768 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 696541184, + "file_name": ".cache\\MatMulNBits_2_0_110.const", + "file_size": 131072 + }, + "model.layers.4.attn.v_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 32768 + ], + "size_in_bytes": 32768, + "op_tensor_size": 32768, + "offset": 696672256, + "file_name": ".cache\\MatMulNBits_2_0_111.const", + "file_size": 32768 + }, + "model.layers.4.attn.o_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 4096 + ], + "size_in_bytes": 16777216, + "op_tensor_size": 16777216, + "offset": 696705024, + "file_name": ".cache\\MatMulNBits_2_0_112.const", + "file_size": 16777216 + }, + "model.layers.4.attn.o_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 713482240, + "file_name": ".cache\\MatMulNBits_2_0_113.const", + "file_size": 16384 + }, + "model.layers.4.attn.o_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 131072 + ], + "size_in_bytes": 524288, + "op_tensor_size": 524288, + "offset": 713498624, + "file_name": ".cache\\MatMulNBits_2_0_114.const", + "file_size": 524288 + }, + "model.layers.4.attn.o_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 131072 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 714022912, + "file_name": ".cache\\MatMulNBits_2_0_115.const", + "file_size": 131072 + }, + "model.layers.4.post_attention_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 714153984, + "file_name": ".cache\\MatMulNBits_2_0_116.const", + "file_size": 8192 + }, + "model.layers.4.mlp.gate_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 714162176, + "file_name": ".cache\\MatMulNBits_2_0_117.const", + "file_size": 29360128 + }, + "model.layers.4.mlp.gate_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 743522304, + "file_name": ".cache\\MatMulNBits_2_0_118.const", + "file_size": 1835008 + }, + "model.layers.4.mlp.gate_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 745357312, + "file_name": ".cache\\MatMulNBits_2_0_119.const", + "file_size": 229376 + }, + "model.layers.4.mlp.gate_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 745586688, + "file_name": ".cache\\MatMulNBits_2_0_120.const", + "file_size": 57344 + }, + "model.layers.4.mlp.up_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 745644032, + "file_name": ".cache\\MatMulNBits_2_0_121.const", + "file_size": 29360128 + }, + "model.layers.4.mlp.up_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 775004160, + "file_name": ".cache\\MatMulNBits_2_0_122.const", + "file_size": 1835008 + }, + "model.layers.4.mlp.up_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 776839168, + "file_name": ".cache\\MatMulNBits_2_0_123.const", + "file_size": 229376 + }, + "model.layers.4.mlp.up_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 777068544, + "file_name": ".cache\\MatMulNBits_2_0_124.const", + "file_size": 57344 + }, + "model.layers.4.mlp.down_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 14336, + 4096 + ], + "size_in_bytes": 58720256, + "op_tensor_size": 58720256, + "offset": 777125888, + "file_name": ".cache\\MatMulNBits_2_0_125.const", + "file_size": 58720256 + }, + "model.layers.4.mlp.down_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 835846144, + "file_name": ".cache\\MatMulNBits_2_0_126.const", + "file_size": 16384 + }, + "model.layers.4.mlp.down_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 835862528, + "file_name": ".cache\\MatMulNBits_2_0_127.const", + "file_size": 1835008 + }, + "model.layers.4.mlp.down_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 458752 + ], + "size_in_bytes": 458752, + "op_tensor_size": 458752, + "offset": 837697536, + "file_name": ".cache\\MatMulNBits_2_0_128.const", + "file_size": 458752 + }, + "model.layers.5.input_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 838156288, + "file_name": ".cache\\MatMulNBits_2_0_129.const", + "file_size": 8192 + }, + "model.layers.5.attn.qk_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 5120 + ], + "size_in_bytes": 20971520, + "op_tensor_size": 20971520, + "offset": 838164480, + "file_name": ".cache\\MatMulNBits_2_0_130.const", + "file_size": 20971520 + }, + "model.layers.5.attn.qk_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 5120 + ], + "size_in_bytes": 20480, + "op_tensor_size": 20480, + "offset": 859136000, + "file_name": ".cache\\MatMulNBits_2_0_131.const", + "file_size": 20480 + }, + "model.layers.5.attn.qk_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 163840 + ], + "size_in_bytes": 655360, + "op_tensor_size": 655360, + "offset": 859156480, + "file_name": ".cache\\MatMulNBits_2_0_132.const", + "file_size": 655360 + }, + "model.layers.5.attn.qk_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 163840 + ], + "size_in_bytes": 163840, + "op_tensor_size": 163840, + "offset": 859811840, + "file_name": ".cache\\MatMulNBits_2_0_133.const", + "file_size": 163840 + }, + "model.layers.5.attn.v_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 1024 + ], + "size_in_bytes": 4194304, + "op_tensor_size": 4194304, + "offset": 859975680, + "file_name": ".cache\\MatMulNBits_2_0_134.const", + "file_size": 4194304 + }, + "model.layers.5.attn.v_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 1024 + ], + "size_in_bytes": 4096, + "op_tensor_size": 4096, + "offset": 864169984, + "file_name": ".cache\\MatMulNBits_2_0_135.const", + "file_size": 4096 + }, + "model.layers.5.attn.v_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 32768 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 864174080, + "file_name": ".cache\\MatMulNBits_2_0_136.const", + "file_size": 131072 + }, + "model.layers.5.attn.v_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 32768 + ], + "size_in_bytes": 32768, + "op_tensor_size": 32768, + "offset": 864305152, + "file_name": ".cache\\MatMulNBits_2_0_137.const", + "file_size": 32768 + }, + "model.layers.5.attn.o_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 4096 + ], + "size_in_bytes": 16777216, + "op_tensor_size": 16777216, + "offset": 864337920, + "file_name": ".cache\\MatMulNBits_2_0_138.const", + "file_size": 16777216 + }, + "model.layers.5.attn.o_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 881115136, + "file_name": ".cache\\MatMulNBits_2_0_139.const", + "file_size": 16384 + }, + "model.layers.5.attn.o_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 131072 + ], + "size_in_bytes": 524288, + "op_tensor_size": 524288, + "offset": 881131520, + "file_name": ".cache\\MatMulNBits_2_0_140.const", + "file_size": 524288 + }, + "model.layers.5.attn.o_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 131072 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 881655808, + "file_name": ".cache\\MatMulNBits_2_0_141.const", + "file_size": 131072 + }, + "model.layers.5.post_attention_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 881786880, + "file_name": ".cache\\MatMulNBits_2_0_142.const", + "file_size": 8192 + }, + "model.layers.5.mlp.gate_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 881795072, + "file_name": ".cache\\MatMulNBits_2_0_143.const", + "file_size": 29360128 + }, + "model.layers.5.mlp.gate_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 911155200, + "file_name": ".cache\\MatMulNBits_2_0_144.const", + "file_size": 1835008 + }, + "model.layers.5.mlp.gate_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 912990208, + "file_name": ".cache\\MatMulNBits_2_0_145.const", + "file_size": 229376 + }, + "model.layers.5.mlp.gate_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 913219584, + "file_name": ".cache\\MatMulNBits_2_0_146.const", + "file_size": 57344 + }, + "model.layers.5.mlp.up_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 913276928, + "file_name": ".cache\\MatMulNBits_2_0_147.const", + "file_size": 29360128 + }, + "model.layers.5.mlp.up_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 942637056, + "file_name": ".cache\\MatMulNBits_2_0_148.const", + "file_size": 1835008 + }, + "model.layers.5.mlp.up_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 944472064, + "file_name": ".cache\\MatMulNBits_2_0_149.const", + "file_size": 229376 + }, + "model.layers.5.mlp.up_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 944701440, + "file_name": ".cache\\MatMulNBits_2_0_150.const", + "file_size": 57344 + }, + "model.layers.5.mlp.down_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 14336, + 4096 + ], + "size_in_bytes": 58720256, + "op_tensor_size": 58720256, + "offset": 944758784, + "file_name": ".cache\\MatMulNBits_2_0_151.const", + "file_size": 58720256 + }, + "model.layers.5.mlp.down_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 1003479040, + "file_name": ".cache\\MatMulNBits_2_0_152.const", + "file_size": 16384 + }, + "model.layers.5.mlp.down_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 1003495424, + "file_name": ".cache\\MatMulNBits_2_0_153.const", + "file_size": 1835008 + }, + "model.layers.5.mlp.down_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 458752 + ], + "size_in_bytes": 458752, + "op_tensor_size": 458752, + "offset": 1005330432, + "file_name": ".cache\\MatMulNBits_2_0_154.const", + "file_size": 458752 + }, + "model.layers.6.input_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1005789184, + "file_name": ".cache\\MatMulNBits_2_0_155.const", + "file_size": 8192 + }, + "model.layers.6.attn.qk_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 5120 + ], + "size_in_bytes": 20971520, + "op_tensor_size": 20971520, + "offset": 1005797376, + "file_name": ".cache\\MatMulNBits_2_0_156.const", + "file_size": 20971520 + }, + "model.layers.6.attn.qk_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 5120 + ], + "size_in_bytes": 20480, + "op_tensor_size": 20480, + "offset": 1026768896, + "file_name": ".cache\\MatMulNBits_2_0_157.const", + "file_size": 20480 + }, + "model.layers.6.attn.qk_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 163840 + ], + "size_in_bytes": 655360, + "op_tensor_size": 655360, + "offset": 1026789376, + "file_name": ".cache\\MatMulNBits_2_0_158.const", + "file_size": 655360 + }, + "model.layers.6.attn.qk_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 163840 + ], + "size_in_bytes": 163840, + "op_tensor_size": 163840, + "offset": 1027444736, + "file_name": ".cache\\MatMulNBits_2_0_159.const", + "file_size": 163840 + }, + "model.layers.6.attn.v_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 1024 + ], + "size_in_bytes": 4194304, + "op_tensor_size": 4194304, + "offset": 1027608576, + "file_name": ".cache\\MatMulNBits_2_0_160.const", + "file_size": 4194304 + }, + "model.layers.6.attn.v_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 1024 + ], + "size_in_bytes": 4096, + "op_tensor_size": 4096, + "offset": 1031802880, + "file_name": ".cache\\MatMulNBits_2_0_161.const", + "file_size": 4096 + }, + "model.layers.6.attn.v_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 32768 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 1031806976, + "file_name": ".cache\\MatMulNBits_2_0_162.const", + "file_size": 131072 + }, + "model.layers.6.attn.v_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 32768 + ], + "size_in_bytes": 32768, + "op_tensor_size": 32768, + "offset": 1031938048, + "file_name": ".cache\\MatMulNBits_2_0_163.const", + "file_size": 32768 + }, + "model.layers.6.attn.o_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 4096 + ], + "size_in_bytes": 16777216, + "op_tensor_size": 16777216, + "offset": 1031970816, + "file_name": ".cache\\MatMulNBits_2_0_164.const", + "file_size": 16777216 + }, + "model.layers.6.attn.o_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 1048748032, + "file_name": ".cache\\MatMulNBits_2_0_165.const", + "file_size": 16384 + }, + "model.layers.6.attn.o_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 131072 + ], + "size_in_bytes": 524288, + "op_tensor_size": 524288, + "offset": 1048764416, + "file_name": ".cache\\MatMulNBits_2_0_166.const", + "file_size": 524288 + }, + "model.layers.6.attn.o_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 131072 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 1049288704, + "file_name": ".cache\\MatMulNBits_2_0_167.const", + "file_size": 131072 + }, + "model.layers.6.post_attention_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1049419776, + "file_name": ".cache\\MatMulNBits_2_0_168.const", + "file_size": 8192 + }, + "model.layers.6.mlp.gate_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 1049427968, + "file_name": ".cache\\MatMulNBits_2_0_169.const", + "file_size": 29360128 + }, + "model.layers.6.mlp.gate_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 1078788096, + "file_name": ".cache\\MatMulNBits_2_0_170.const", + "file_size": 1835008 + }, + "model.layers.6.mlp.gate_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 1080623104, + "file_name": ".cache\\MatMulNBits_2_0_171.const", + "file_size": 229376 + }, + "model.layers.6.mlp.gate_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 1080852480, + "file_name": ".cache\\MatMulNBits_2_0_172.const", + "file_size": 57344 + }, + "model.layers.6.mlp.up_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 1080909824, + "file_name": ".cache\\MatMulNBits_2_0_173.const", + "file_size": 29360128 + }, + "model.layers.6.mlp.up_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 1110269952, + "file_name": ".cache\\MatMulNBits_2_0_174.const", + "file_size": 1835008 + }, + "model.layers.6.mlp.up_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 1112104960, + "file_name": ".cache\\MatMulNBits_2_0_175.const", + "file_size": 229376 + }, + "model.layers.6.mlp.up_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 1112334336, + "file_name": ".cache\\MatMulNBits_2_0_176.const", + "file_size": 57344 + }, + "model.layers.6.mlp.down_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 14336, + 4096 + ], + "size_in_bytes": 58720256, + "op_tensor_size": 58720256, + "offset": 1112391680, + "file_name": ".cache\\MatMulNBits_2_0_177.const", + "file_size": 58720256 + }, + "model.layers.6.mlp.down_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 1171111936, + "file_name": ".cache\\MatMulNBits_2_0_178.const", + "file_size": 16384 + }, + "model.layers.6.mlp.down_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 1171128320, + "file_name": ".cache\\MatMulNBits_2_0_179.const", + "file_size": 1835008 + }, + "model.layers.6.mlp.down_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 458752 + ], + "size_in_bytes": 458752, + "op_tensor_size": 458752, + "offset": 1172963328, + "file_name": ".cache\\MatMulNBits_2_0_180.const", + "file_size": 458752 + }, + "model.layers.7.input_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1173422080, + "file_name": ".cache\\MatMulNBits_2_0_181.const", + "file_size": 8192 + }, + "model.layers.7.attn.qk_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 5120 + ], + "size_in_bytes": 20971520, + "op_tensor_size": 20971520, + "offset": 1173430272, + "file_name": ".cache\\MatMulNBits_2_0_182.const", + "file_size": 20971520 + }, + "model.layers.7.attn.qk_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 5120 + ], + "size_in_bytes": 20480, + "op_tensor_size": 20480, + "offset": 1194401792, + "file_name": ".cache\\MatMulNBits_2_0_183.const", + "file_size": 20480 + }, + "model.layers.7.attn.qk_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 163840 + ], + "size_in_bytes": 655360, + "op_tensor_size": 655360, + "offset": 1194422272, + "file_name": ".cache\\MatMulNBits_2_0_184.const", + "file_size": 655360 + }, + "model.layers.7.attn.qk_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 163840 + ], + "size_in_bytes": 163840, + "op_tensor_size": 163840, + "offset": 1195077632, + "file_name": ".cache\\MatMulNBits_2_0_185.const", + "file_size": 163840 + }, + "model.layers.7.attn.v_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 1024 + ], + "size_in_bytes": 4194304, + "op_tensor_size": 4194304, + "offset": 1195241472, + "file_name": ".cache\\MatMulNBits_2_0_186.const", + "file_size": 4194304 + }, + "model.layers.7.attn.v_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 1024 + ], + "size_in_bytes": 4096, + "op_tensor_size": 4096, + "offset": 1199435776, + "file_name": ".cache\\MatMulNBits_2_0_187.const", + "file_size": 4096 + }, + "model.layers.7.attn.v_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 32768 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 1199439872, + "file_name": ".cache\\MatMulNBits_2_0_188.const", + "file_size": 131072 + }, + "model.layers.7.attn.v_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 32768 + ], + "size_in_bytes": 32768, + "op_tensor_size": 32768, + "offset": 1199570944, + "file_name": ".cache\\MatMulNBits_2_0_189.const", + "file_size": 32768 + }, + "model.layers.7.attn.o_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 4096 + ], + "size_in_bytes": 16777216, + "op_tensor_size": 16777216, + "offset": 1199603712, + "file_name": ".cache\\MatMulNBits_2_0_190.const", + "file_size": 16777216 + }, + "model.layers.7.attn.o_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 1216380928, + "file_name": ".cache\\MatMulNBits_2_0_191.const", + "file_size": 16384 + }, + "model.layers.7.attn.o_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 131072 + ], + "size_in_bytes": 524288, + "op_tensor_size": 524288, + "offset": 1216397312, + "file_name": ".cache\\MatMulNBits_2_0_192.const", + "file_size": 524288 + }, + "model.layers.7.attn.o_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 131072 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 1216921600, + "file_name": ".cache\\MatMulNBits_2_0_193.const", + "file_size": 131072 + }, + "model.layers.7.post_attention_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1217052672, + "file_name": ".cache\\MatMulNBits_2_0_194.const", + "file_size": 8192 + }, + "model.layers.7.mlp.gate_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 1217060864, + "file_name": ".cache\\MatMulNBits_2_0_195.const", + "file_size": 29360128 + }, + "model.layers.7.mlp.gate_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 1246420992, + "file_name": ".cache\\MatMulNBits_2_0_196.const", + "file_size": 1835008 + }, + "model.layers.7.mlp.gate_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 1248256000, + "file_name": ".cache\\MatMulNBits_2_0_197.const", + "file_size": 229376 + }, + "model.layers.7.mlp.gate_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 1248485376, + "file_name": ".cache\\MatMulNBits_2_0_198.const", + "file_size": 57344 + }, + "model.layers.7.mlp.up_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 1248542720, + "file_name": ".cache\\MatMulNBits_2_0_199.const", + "file_size": 29360128 + }, + "model.layers.7.mlp.up_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 1277902848, + "file_name": ".cache\\MatMulNBits_2_0_200.const", + "file_size": 1835008 + }, + "model.layers.7.mlp.up_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 1279737856, + "file_name": ".cache\\MatMulNBits_2_0_201.const", + "file_size": 229376 + }, + "model.layers.7.mlp.up_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 1279967232, + "file_name": ".cache\\MatMulNBits_2_0_202.const", + "file_size": 57344 + }, + "model.layers.7.mlp.down_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 14336, + 4096 + ], + "size_in_bytes": 58720256, + "op_tensor_size": 58720256, + "offset": 1280024576, + "file_name": ".cache\\MatMulNBits_2_0_203.const", + "file_size": 58720256 + }, + "model.layers.7.mlp.down_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 1338744832, + "file_name": ".cache\\MatMulNBits_2_0_204.const", + "file_size": 16384 + }, + "model.layers.7.mlp.down_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 1338761216, + "file_name": ".cache\\MatMulNBits_2_0_205.const", + "file_size": 1835008 + }, + "model.layers.7.mlp.down_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 458752 + ], + "size_in_bytes": 458752, + "op_tensor_size": 458752, + "offset": 1340596224, + "file_name": ".cache\\MatMulNBits_2_0_206.const", + "file_size": 458752 + }, + "model.layers.8.input_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1341054976, + "file_name": ".cache\\MatMulNBits_2_0_207.const", + "file_size": 8192 + }, + "model.layers.8.attn.qk_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 5120 + ], + "size_in_bytes": 20971520, + "op_tensor_size": 20971520, + "offset": 1341063168, + "file_name": ".cache\\MatMulNBits_2_0_208.const", + "file_size": 20971520 + }, + "model.layers.8.attn.qk_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 5120 + ], + "size_in_bytes": 20480, + "op_tensor_size": 20480, + "offset": 1362034688, + "file_name": ".cache\\MatMulNBits_2_0_209.const", + "file_size": 20480 + }, + "model.layers.8.attn.qk_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 163840 + ], + "size_in_bytes": 655360, + "op_tensor_size": 655360, + "offset": 1362055168, + "file_name": ".cache\\MatMulNBits_2_0_210.const", + "file_size": 655360 + }, + "model.layers.8.attn.qk_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 163840 + ], + "size_in_bytes": 163840, + "op_tensor_size": 163840, + "offset": 1362710528, + "file_name": ".cache\\MatMulNBits_2_0_211.const", + "file_size": 163840 + }, + "model.layers.8.attn.v_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 1024 + ], + "size_in_bytes": 4194304, + "op_tensor_size": 4194304, + "offset": 1362874368, + "file_name": ".cache\\MatMulNBits_2_0_212.const", + "file_size": 4194304 + }, + "model.layers.8.attn.v_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 1024 + ], + "size_in_bytes": 4096, + "op_tensor_size": 4096, + "offset": 1367068672, + "file_name": ".cache\\MatMulNBits_2_0_213.const", + "file_size": 4096 + }, + "model.layers.8.attn.v_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 32768 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 1367072768, + "file_name": ".cache\\MatMulNBits_2_0_214.const", + "file_size": 131072 + }, + "model.layers.8.attn.v_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 32768 + ], + "size_in_bytes": 32768, + "op_tensor_size": 32768, + "offset": 1367203840, + "file_name": ".cache\\MatMulNBits_2_0_215.const", + "file_size": 32768 + }, + "model.layers.8.attn.o_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 4096 + ], + "size_in_bytes": 16777216, + "op_tensor_size": 16777216, + "offset": 1367236608, + "file_name": ".cache\\MatMulNBits_2_0_216.const", + "file_size": 16777216 + }, + "model.layers.8.attn.o_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 1384013824, + "file_name": ".cache\\MatMulNBits_2_0_217.const", + "file_size": 16384 + }, + "model.layers.8.attn.o_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 131072 + ], + "size_in_bytes": 524288, + "op_tensor_size": 524288, + "offset": 1384030208, + "file_name": ".cache\\MatMulNBits_2_0_218.const", + "file_size": 524288 + }, + "model.layers.8.attn.o_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 131072 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 1384554496, + "file_name": ".cache\\MatMulNBits_2_0_219.const", + "file_size": 131072 + }, + "model.layers.8.post_attention_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1384685568, + "file_name": ".cache\\MatMulNBits_2_0_220.const", + "file_size": 8192 + }, + "model.layers.8.mlp.gate_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 1384693760, + "file_name": ".cache\\MatMulNBits_2_0_221.const", + "file_size": 29360128 + }, + "model.layers.8.mlp.gate_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 1414053888, + "file_name": ".cache\\MatMulNBits_2_0_222.const", + "file_size": 1835008 + }, + "model.layers.8.mlp.gate_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 1415888896, + "file_name": ".cache\\MatMulNBits_2_0_223.const", + "file_size": 229376 + }, + "model.layers.8.mlp.gate_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 1416118272, + "file_name": ".cache\\MatMulNBits_2_0_224.const", + "file_size": 57344 + }, + "model.layers.8.mlp.up_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 1416175616, + "file_name": ".cache\\MatMulNBits_2_0_225.const", + "file_size": 29360128 + }, + "model.layers.8.mlp.up_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 1445535744, + "file_name": ".cache\\MatMulNBits_2_0_226.const", + "file_size": 1835008 + }, + "model.layers.8.mlp.up_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 1447370752, + "file_name": ".cache\\MatMulNBits_2_0_227.const", + "file_size": 229376 + }, + "model.layers.8.mlp.up_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 1447600128, + "file_name": ".cache\\MatMulNBits_2_0_228.const", + "file_size": 57344 + }, + "model.layers.8.mlp.down_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 14336, + 4096 + ], + "size_in_bytes": 58720256, + "op_tensor_size": 58720256, + "offset": 1447657472, + "file_name": ".cache\\MatMulNBits_2_0_229.const", + "file_size": 58720256 + }, + "model.layers.8.mlp.down_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 1506377728, + "file_name": ".cache\\MatMulNBits_2_0_230.const", + "file_size": 16384 + }, + "model.layers.8.mlp.down_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 1506394112, + "file_name": ".cache\\MatMulNBits_2_0_231.const", + "file_size": 1835008 + }, + "model.layers.8.mlp.down_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 458752 + ], + "size_in_bytes": 458752, + "op_tensor_size": 458752, + "offset": 1508229120, + "file_name": ".cache\\MatMulNBits_2_0_232.const", + "file_size": 458752 + }, + "model.layers.9.input_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1508687872, + "file_name": ".cache\\MatMulNBits_2_0_233.const", + "file_size": 8192 + }, + "model.layers.9.attn.qk_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 5120 + ], + "size_in_bytes": 20971520, + "op_tensor_size": 20971520, + "offset": 1508696064, + "file_name": ".cache\\MatMulNBits_2_0_234.const", + "file_size": 20971520 + }, + "model.layers.9.attn.qk_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 5120 + ], + "size_in_bytes": 20480, + "op_tensor_size": 20480, + "offset": 1529667584, + "file_name": ".cache\\MatMulNBits_2_0_235.const", + "file_size": 20480 + }, + "model.layers.9.attn.qk_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 163840 + ], + "size_in_bytes": 655360, + "op_tensor_size": 655360, + "offset": 1529688064, + "file_name": ".cache\\MatMulNBits_2_0_236.const", + "file_size": 655360 + }, + "model.layers.9.attn.qk_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 163840 + ], + "size_in_bytes": 163840, + "op_tensor_size": 163840, + "offset": 1530343424, + "file_name": ".cache\\MatMulNBits_2_0_237.const", + "file_size": 163840 + }, + "model.layers.9.attn.v_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 1024 + ], + "size_in_bytes": 4194304, + "op_tensor_size": 4194304, + "offset": 1530507264, + "file_name": ".cache\\MatMulNBits_2_0_238.const", + "file_size": 4194304 + }, + "model.layers.9.attn.v_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 1024 + ], + "size_in_bytes": 4096, + "op_tensor_size": 4096, + "offset": 1534701568, + "file_name": ".cache\\MatMulNBits_2_0_239.const", + "file_size": 4096 + }, + "model.layers.9.attn.v_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 32768 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 1534705664, + "file_name": ".cache\\MatMulNBits_2_0_240.const", + "file_size": 131072 + }, + "model.layers.9.attn.v_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 32768 + ], + "size_in_bytes": 32768, + "op_tensor_size": 32768, + "offset": 1534836736, + "file_name": ".cache\\MatMulNBits_2_0_241.const", + "file_size": 32768 + }, + "model.layers.9.attn.o_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 4096 + ], + "size_in_bytes": 16777216, + "op_tensor_size": 16777216, + "offset": 1534869504, + "file_name": ".cache\\MatMulNBits_2_0_242.const", + "file_size": 16777216 + }, + "model.layers.9.attn.o_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 1551646720, + "file_name": ".cache\\MatMulNBits_2_0_243.const", + "file_size": 16384 + }, + "model.layers.9.attn.o_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 131072 + ], + "size_in_bytes": 524288, + "op_tensor_size": 524288, + "offset": 1551663104, + "file_name": ".cache\\MatMulNBits_2_0_244.const", + "file_size": 524288 + }, + "model.layers.9.attn.o_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 131072 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 1552187392, + "file_name": ".cache\\MatMulNBits_2_0_245.const", + "file_size": 131072 + }, + "model.layers.9.post_attention_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1552318464, + "file_name": ".cache\\MatMulNBits_2_0_246.const", + "file_size": 8192 + }, + "model.layers.9.mlp.gate_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 1552326656, + "file_name": ".cache\\MatMulNBits_2_0_247.const", + "file_size": 29360128 + }, + "model.layers.9.mlp.gate_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 1581686784, + "file_name": ".cache\\MatMulNBits_2_0_248.const", + "file_size": 1835008 + }, + "model.layers.9.mlp.gate_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 1583521792, + "file_name": ".cache\\MatMulNBits_2_0_249.const", + "file_size": 229376 + }, + "model.layers.9.mlp.gate_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 1583751168, + "file_name": ".cache\\MatMulNBits_2_0_250.const", + "file_size": 57344 + }, + "model.layers.9.mlp.up_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 1583808512, + "file_name": ".cache\\MatMulNBits_2_0_251.const", + "file_size": 29360128 + }, + "model.layers.9.mlp.up_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 1613168640, + "file_name": ".cache\\MatMulNBits_2_0_252.const", + "file_size": 1835008 + }, + "model.layers.9.mlp.up_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 1615003648, + "file_name": ".cache\\MatMulNBits_2_0_253.const", + "file_size": 229376 + }, + "model.layers.9.mlp.up_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 1615233024, + "file_name": ".cache\\MatMulNBits_2_0_254.const", + "file_size": 57344 + }, + "model.layers.9.mlp.down_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 14336, + 4096 + ], + "size_in_bytes": 58720256, + "op_tensor_size": 58720256, + "offset": 1615290368, + "file_name": ".cache\\MatMulNBits_2_0_255.const", + "file_size": 58720256 + }, + "model.layers.9.mlp.down_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 1674010624, + "file_name": ".cache\\MatMulNBits_2_0_256.const", + "file_size": 16384 + }, + "model.layers.9.mlp.down_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 1674027008, + "file_name": ".cache\\MatMulNBits_2_0_257.const", + "file_size": 1835008 + }, + "model.layers.9.mlp.down_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 458752 + ], + "size_in_bytes": 458752, + "op_tensor_size": 458752, + "offset": 1675862016, + "file_name": ".cache\\MatMulNBits_2_0_258.const", + "file_size": 458752 + }, + "model.layers.10.input_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1676320768, + "file_name": ".cache\\MatMulNBits_2_0_259.const", + "file_size": 8192 + }, + "model.layers.10.attn.qk_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 5120 + ], + "size_in_bytes": 20971520, + "op_tensor_size": 20971520, + "offset": 1676328960, + "file_name": ".cache\\MatMulNBits_2_0_260.const", + "file_size": 20971520 + }, + "model.layers.10.attn.qk_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 5120 + ], + "size_in_bytes": 20480, + "op_tensor_size": 20480, + "offset": 1697300480, + "file_name": ".cache\\MatMulNBits_2_0_261.const", + "file_size": 20480 + }, + "model.layers.10.attn.qk_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 163840 + ], + "size_in_bytes": 655360, + "op_tensor_size": 655360, + "offset": 1697320960, + "file_name": ".cache\\MatMulNBits_2_0_262.const", + "file_size": 655360 + }, + "model.layers.10.attn.qk_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 163840 + ], + "size_in_bytes": 163840, + "op_tensor_size": 163840, + "offset": 1697976320, + "file_name": ".cache\\MatMulNBits_2_0_263.const", + "file_size": 163840 + }, + "model.layers.10.attn.v_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 1024 + ], + "size_in_bytes": 4194304, + "op_tensor_size": 4194304, + "offset": 1698140160, + "file_name": ".cache\\MatMulNBits_2_0_264.const", + "file_size": 4194304 + }, + "model.layers.10.attn.v_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 1024 + ], + "size_in_bytes": 4096, + "op_tensor_size": 4096, + "offset": 1702334464, + "file_name": ".cache\\MatMulNBits_2_0_265.const", + "file_size": 4096 + }, + "model.layers.10.attn.v_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 32768 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 1702338560, + "file_name": ".cache\\MatMulNBits_2_0_266.const", + "file_size": 131072 + }, + "model.layers.10.attn.v_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 32768 + ], + "size_in_bytes": 32768, + "op_tensor_size": 32768, + "offset": 1702469632, + "file_name": ".cache\\MatMulNBits_2_0_267.const", + "file_size": 32768 + }, + "model.layers.10.attn.o_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 4096 + ], + "size_in_bytes": 16777216, + "op_tensor_size": 16777216, + "offset": 1702502400, + "file_name": ".cache\\MatMulNBits_2_0_268.const", + "file_size": 16777216 + }, + "model.layers.10.attn.o_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 1719279616, + "file_name": ".cache\\MatMulNBits_2_0_269.const", + "file_size": 16384 + }, + "model.layers.10.attn.o_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 131072 + ], + "size_in_bytes": 524288, + "op_tensor_size": 524288, + "offset": 1719296000, + "file_name": ".cache\\MatMulNBits_2_0_270.const", + "file_size": 524288 + }, + "model.layers.10.attn.o_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 131072 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 1719820288, + "file_name": ".cache\\MatMulNBits_2_0_271.const", + "file_size": 131072 + }, + "model.layers.10.post_attention_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1719951360, + "file_name": ".cache\\MatMulNBits_2_0_272.const", + "file_size": 8192 + }, + "model.layers.10.mlp.gate_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 1719959552, + "file_name": ".cache\\MatMulNBits_2_0_273.const", + "file_size": 29360128 + }, + "model.layers.10.mlp.gate_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 1749319680, + "file_name": ".cache\\MatMulNBits_2_0_274.const", + "file_size": 1835008 + }, + "model.layers.10.mlp.gate_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 1751154688, + "file_name": ".cache\\MatMulNBits_2_0_275.const", + "file_size": 229376 + }, + "model.layers.10.mlp.gate_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 1751384064, + "file_name": ".cache\\MatMulNBits_2_0_276.const", + "file_size": 57344 + }, + "model.layers.10.mlp.up_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 1751441408, + "file_name": ".cache\\MatMulNBits_2_0_277.const", + "file_size": 29360128 + }, + "model.layers.10.mlp.up_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 1780801536, + "file_name": ".cache\\MatMulNBits_2_0_278.const", + "file_size": 1835008 + }, + "model.layers.10.mlp.up_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 1782636544, + "file_name": ".cache\\MatMulNBits_2_0_279.const", + "file_size": 229376 + }, + "model.layers.10.mlp.up_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 1782865920, + "file_name": ".cache\\MatMulNBits_2_0_280.const", + "file_size": 57344 + }, + "model.layers.10.mlp.down_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 14336, + 4096 + ], + "size_in_bytes": 58720256, + "op_tensor_size": 58720256, + "offset": 1782923264, + "file_name": ".cache\\MatMulNBits_2_0_281.const", + "file_size": 58720256 + }, + "model.layers.10.mlp.down_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 1841643520, + "file_name": ".cache\\MatMulNBits_2_0_282.const", + "file_size": 16384 + }, + "model.layers.10.mlp.down_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 1841659904, + "file_name": ".cache\\MatMulNBits_2_0_283.const", + "file_size": 1835008 + }, + "model.layers.10.mlp.down_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 458752 + ], + "size_in_bytes": 458752, + "op_tensor_size": 458752, + "offset": 1843494912, + "file_name": ".cache\\MatMulNBits_2_0_284.const", + "file_size": 458752 + }, + "model.layers.11.input_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1843953664, + "file_name": ".cache\\MatMulNBits_2_0_285.const", + "file_size": 8192 + }, + "model.layers.11.attn.qk_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 5120 + ], + "size_in_bytes": 20971520, + "op_tensor_size": 20971520, + "offset": 1843961856, + "file_name": ".cache\\MatMulNBits_2_0_286.const", + "file_size": 20971520 + }, + "model.layers.11.attn.qk_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 5120 + ], + "size_in_bytes": 20480, + "op_tensor_size": 20480, + "offset": 1864933376, + "file_name": ".cache\\MatMulNBits_2_0_287.const", + "file_size": 20480 + }, + "model.layers.11.attn.qk_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 163840 + ], + "size_in_bytes": 655360, + "op_tensor_size": 655360, + "offset": 1864953856, + "file_name": ".cache\\MatMulNBits_2_0_288.const", + "file_size": 655360 + }, + "model.layers.11.attn.qk_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 163840 + ], + "size_in_bytes": 163840, + "op_tensor_size": 163840, + "offset": 1865609216, + "file_name": ".cache\\MatMulNBits_2_0_289.const", + "file_size": 163840 + }, + "model.layers.11.attn.v_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 1024 + ], + "size_in_bytes": 4194304, + "op_tensor_size": 4194304, + "offset": 1865773056, + "file_name": ".cache\\MatMulNBits_2_0_290.const", + "file_size": 4194304 + }, + "model.layers.11.attn.v_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 1024 + ], + "size_in_bytes": 4096, + "op_tensor_size": 4096, + "offset": 1869967360, + "file_name": ".cache\\MatMulNBits_2_0_291.const", + "file_size": 4096 + }, + "model.layers.11.attn.v_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 32768 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 1869971456, + "file_name": ".cache\\MatMulNBits_2_0_292.const", + "file_size": 131072 + }, + "model.layers.11.attn.v_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 32768 + ], + "size_in_bytes": 32768, + "op_tensor_size": 32768, + "offset": 1870102528, + "file_name": ".cache\\MatMulNBits_2_0_293.const", + "file_size": 32768 + }, + "model.layers.11.attn.o_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 4096 + ], + "size_in_bytes": 16777216, + "op_tensor_size": 16777216, + "offset": 1870135296, + "file_name": ".cache\\MatMulNBits_2_0_294.const", + "file_size": 16777216 + }, + "model.layers.11.attn.o_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 1886912512, + "file_name": ".cache\\MatMulNBits_2_0_295.const", + "file_size": 16384 + }, + "model.layers.11.attn.o_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 131072 + ], + "size_in_bytes": 524288, + "op_tensor_size": 524288, + "offset": 1886928896, + "file_name": ".cache\\MatMulNBits_2_0_296.const", + "file_size": 524288 + }, + "model.layers.11.attn.o_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 131072 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 1887453184, + "file_name": ".cache\\MatMulNBits_2_0_297.const", + "file_size": 131072 + }, + "model.layers.11.post_attention_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 1887584256, + "file_name": ".cache\\MatMulNBits_2_0_298.const", + "file_size": 8192 + }, + "model.layers.11.mlp.gate_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 1887592448, + "file_name": ".cache\\MatMulNBits_2_0_299.const", + "file_size": 29360128 + }, + "model.layers.11.mlp.gate_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 1916952576, + "file_name": ".cache\\MatMulNBits_2_0_300.const", + "file_size": 1835008 + }, + "model.layers.11.mlp.gate_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 1918787584, + "file_name": ".cache\\MatMulNBits_2_0_301.const", + "file_size": 229376 + }, + "model.layers.11.mlp.gate_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 1919016960, + "file_name": ".cache\\MatMulNBits_2_0_302.const", + "file_size": 57344 + }, + "model.layers.11.mlp.up_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 1919074304, + "file_name": ".cache\\MatMulNBits_2_0_303.const", + "file_size": 29360128 + }, + "model.layers.11.mlp.up_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 1948434432, + "file_name": ".cache\\MatMulNBits_2_0_304.const", + "file_size": 1835008 + }, + "model.layers.11.mlp.up_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 1950269440, + "file_name": ".cache\\MatMulNBits_2_0_305.const", + "file_size": 229376 + }, + "model.layers.11.mlp.up_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 1950498816, + "file_name": ".cache\\MatMulNBits_2_0_306.const", + "file_size": 57344 + }, + "model.layers.11.mlp.down_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 14336, + 4096 + ], + "size_in_bytes": 58720256, + "op_tensor_size": 58720256, + "offset": 1950556160, + "file_name": ".cache\\MatMulNBits_2_0_307.const", + "file_size": 58720256 + }, + "model.layers.11.mlp.down_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 2009276416, + "file_name": ".cache\\MatMulNBits_2_0_308.const", + "file_size": 16384 + }, + "model.layers.11.mlp.down_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 2009292800, + "file_name": ".cache\\MatMulNBits_2_0_309.const", + "file_size": 1835008 + }, + "model.layers.11.mlp.down_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 458752 + ], + "size_in_bytes": 458752, + "op_tensor_size": 458752, + "offset": 2011127808, + "file_name": ".cache\\MatMulNBits_2_0_310.const", + "file_size": 458752 + }, + "model.layers.12.input_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2011586560, + "file_name": ".cache\\MatMulNBits_2_0_311.const", + "file_size": 8192 + }, + "model.layers.12.attn.qk_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 5120 + ], + "size_in_bytes": 20971520, + "op_tensor_size": 20971520, + "offset": 2011594752, + "file_name": ".cache\\MatMulNBits_2_0_312.const", + "file_size": 20971520 + }, + "model.layers.12.attn.qk_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 5120 + ], + "size_in_bytes": 20480, + "op_tensor_size": 20480, + "offset": 2032566272, + "file_name": ".cache\\MatMulNBits_2_0_313.const", + "file_size": 20480 + }, + "model.layers.12.attn.qk_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 163840 + ], + "size_in_bytes": 655360, + "op_tensor_size": 655360, + "offset": 2032586752, + "file_name": ".cache\\MatMulNBits_2_0_314.const", + "file_size": 655360 + }, + "model.layers.12.attn.qk_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 163840 + ], + "size_in_bytes": 163840, + "op_tensor_size": 163840, + "offset": 2033242112, + "file_name": ".cache\\MatMulNBits_2_0_315.const", + "file_size": 163840 + }, + "model.layers.12.attn.v_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 1024 + ], + "size_in_bytes": 4194304, + "op_tensor_size": 4194304, + "offset": 2033405952, + "file_name": ".cache\\MatMulNBits_2_0_316.const", + "file_size": 4194304 + }, + "model.layers.12.attn.v_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 1024 + ], + "size_in_bytes": 4096, + "op_tensor_size": 4096, + "offset": 2037600256, + "file_name": ".cache\\MatMulNBits_2_0_317.const", + "file_size": 4096 + }, + "model.layers.12.attn.v_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 32768 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 2037604352, + "file_name": ".cache\\MatMulNBits_2_0_318.const", + "file_size": 131072 + }, + "model.layers.12.attn.v_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 32768 + ], + "size_in_bytes": 32768, + "op_tensor_size": 32768, + "offset": 2037735424, + "file_name": ".cache\\MatMulNBits_2_0_319.const", + "file_size": 32768 + }, + "model.layers.12.attn.o_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 4096 + ], + "size_in_bytes": 16777216, + "op_tensor_size": 16777216, + "offset": 2037768192, + "file_name": ".cache\\MatMulNBits_2_0_320.const", + "file_size": 16777216 + }, + "model.layers.12.attn.o_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 2054545408, + "file_name": ".cache\\MatMulNBits_2_0_321.const", + "file_size": 16384 + }, + "model.layers.12.attn.o_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 131072 + ], + "size_in_bytes": 524288, + "op_tensor_size": 524288, + "offset": 2054561792, + "file_name": ".cache\\MatMulNBits_2_0_322.const", + "file_size": 524288 + }, + "model.layers.12.attn.o_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 131072 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 2055086080, + "file_name": ".cache\\MatMulNBits_2_0_323.const", + "file_size": 131072 + }, + "model.layers.12.post_attention_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2055217152, + "file_name": ".cache\\MatMulNBits_2_0_324.const", + "file_size": 8192 + }, + "model.layers.12.mlp.gate_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 2055225344, + "file_name": ".cache\\MatMulNBits_2_0_325.const", + "file_size": 29360128 + }, + "model.layers.12.mlp.gate_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 2084585472, + "file_name": ".cache\\MatMulNBits_2_0_326.const", + "file_size": 1835008 + }, + "model.layers.12.mlp.gate_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 2086420480, + "file_name": ".cache\\MatMulNBits_2_0_327.const", + "file_size": 229376 + }, + "model.layers.12.mlp.gate_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 2086649856, + "file_name": ".cache\\MatMulNBits_2_0_328.const", + "file_size": 57344 + }, + "model.layers.12.mlp.up_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 2086707200, + "file_name": ".cache\\MatMulNBits_2_0_329.const", + "file_size": 29360128 + }, + "model.layers.12.mlp.up_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 2116067328, + "file_name": ".cache\\MatMulNBits_2_0_330.const", + "file_size": 1835008 + }, + "model.layers.12.mlp.up_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 2117902336, + "file_name": ".cache\\MatMulNBits_2_0_331.const", + "file_size": 229376 + }, + "model.layers.12.mlp.up_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 2118131712, + "file_name": ".cache\\MatMulNBits_2_0_332.const", + "file_size": 57344 + }, + "model.layers.12.mlp.down_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 14336, + 4096 + ], + "size_in_bytes": 58720256, + "op_tensor_size": 58720256, + "offset": 2118189056, + "file_name": ".cache\\MatMulNBits_2_0_333.const", + "file_size": 58720256 + }, + "model.layers.12.mlp.down_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 2176909312, + "file_name": ".cache\\MatMulNBits_2_0_334.const", + "file_size": 16384 + }, + "model.layers.12.mlp.down_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 2176925696, + "file_name": ".cache\\MatMulNBits_2_0_335.const", + "file_size": 1835008 + }, + "model.layers.12.mlp.down_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 458752 + ], + "size_in_bytes": 458752, + "op_tensor_size": 458752, + "offset": 2178760704, + "file_name": ".cache\\MatMulNBits_2_0_336.const", + "file_size": 458752 + }, + "model.layers.13.input_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2179219456, + "file_name": ".cache\\MatMulNBits_2_0_337.const", + "file_size": 8192 + }, + "model.layers.13.attn.qk_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 5120 + ], + "size_in_bytes": 20971520, + "op_tensor_size": 20971520, + "offset": 2179227648, + "file_name": ".cache\\MatMulNBits_2_0_338.const", + "file_size": 20971520 + }, + "model.layers.13.attn.qk_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 5120 + ], + "size_in_bytes": 20480, + "op_tensor_size": 20480, + "offset": 2200199168, + "file_name": ".cache\\MatMulNBits_2_0_339.const", + "file_size": 20480 + }, + "model.layers.13.attn.qk_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 163840 + ], + "size_in_bytes": 655360, + "op_tensor_size": 655360, + "offset": 2200219648, + "file_name": ".cache\\MatMulNBits_2_0_340.const", + "file_size": 655360 + }, + "model.layers.13.attn.qk_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 163840 + ], + "size_in_bytes": 163840, + "op_tensor_size": 163840, + "offset": 2200875008, + "file_name": ".cache\\MatMulNBits_2_0_341.const", + "file_size": 163840 + }, + "model.layers.13.attn.v_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 1024 + ], + "size_in_bytes": 4194304, + "op_tensor_size": 4194304, + "offset": 2201038848, + "file_name": ".cache\\MatMulNBits_2_0_342.const", + "file_size": 4194304 + }, + "model.layers.13.attn.v_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 1024 + ], + "size_in_bytes": 4096, + "op_tensor_size": 4096, + "offset": 2205233152, + "file_name": ".cache\\MatMulNBits_2_0_343.const", + "file_size": 4096 + }, + "model.layers.13.attn.v_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 32768 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 2205237248, + "file_name": ".cache\\MatMulNBits_2_0_344.const", + "file_size": 131072 + }, + "model.layers.13.attn.v_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 32768 + ], + "size_in_bytes": 32768, + "op_tensor_size": 32768, + "offset": 2205368320, + "file_name": ".cache\\MatMulNBits_2_0_345.const", + "file_size": 32768 + }, + "model.layers.13.attn.o_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 4096 + ], + "size_in_bytes": 16777216, + "op_tensor_size": 16777216, + "offset": 2205401088, + "file_name": ".cache\\MatMulNBits_2_0_346.const", + "file_size": 16777216 + }, + "model.layers.13.attn.o_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 2222178304, + "file_name": ".cache\\MatMulNBits_2_0_347.const", + "file_size": 16384 + }, + "model.layers.13.attn.o_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 131072 + ], + "size_in_bytes": 524288, + "op_tensor_size": 524288, + "offset": 2222194688, + "file_name": ".cache\\MatMulNBits_2_0_348.const", + "file_size": 524288 + }, + "model.layers.13.attn.o_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 131072 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 2222718976, + "file_name": ".cache\\MatMulNBits_2_0_349.const", + "file_size": 131072 + }, + "model.layers.13.post_attention_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2222850048, + "file_name": ".cache\\MatMulNBits_2_0_350.const", + "file_size": 8192 + }, + "model.layers.13.mlp.gate_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 2222858240, + "file_name": ".cache\\MatMulNBits_2_0_351.const", + "file_size": 29360128 + }, + "model.layers.13.mlp.gate_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 2252218368, + "file_name": ".cache\\MatMulNBits_2_0_352.const", + "file_size": 1835008 + }, + "model.layers.13.mlp.gate_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 2254053376, + "file_name": ".cache\\MatMulNBits_2_0_353.const", + "file_size": 229376 + }, + "model.layers.13.mlp.gate_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 2254282752, + "file_name": ".cache\\MatMulNBits_2_0_354.const", + "file_size": 57344 + }, + "model.layers.13.mlp.up_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 2254340096, + "file_name": ".cache\\MatMulNBits_2_0_355.const", + "file_size": 29360128 + }, + "model.layers.13.mlp.up_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 2283700224, + "file_name": ".cache\\MatMulNBits_2_0_356.const", + "file_size": 1835008 + }, + "model.layers.13.mlp.up_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 2285535232, + "file_name": ".cache\\MatMulNBits_2_0_357.const", + "file_size": 229376 + }, + "model.layers.13.mlp.up_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 2285764608, + "file_name": ".cache\\MatMulNBits_2_0_358.const", + "file_size": 57344 + }, + "model.layers.13.mlp.down_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 14336, + 4096 + ], + "size_in_bytes": 58720256, + "op_tensor_size": 58720256, + "offset": 2285821952, + "file_name": ".cache\\MatMulNBits_2_0_359.const", + "file_size": 58720256 + }, + "model.layers.13.mlp.down_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 2344542208, + "file_name": ".cache\\MatMulNBits_2_0_360.const", + "file_size": 16384 + }, + "model.layers.13.mlp.down_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 2344558592, + "file_name": ".cache\\MatMulNBits_2_0_361.const", + "file_size": 1835008 + }, + "model.layers.13.mlp.down_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 458752 + ], + "size_in_bytes": 458752, + "op_tensor_size": 458752, + "offset": 2346393600, + "file_name": ".cache\\MatMulNBits_2_0_362.const", + "file_size": 458752 + }, + "model.layers.14.input_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2346852352, + "file_name": ".cache\\MatMulNBits_2_0_363.const", + "file_size": 8192 + }, + "model.layers.14.attn.qk_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 5120 + ], + "size_in_bytes": 20971520, + "op_tensor_size": 20971520, + "offset": 2346860544, + "file_name": ".cache\\MatMulNBits_2_0_364.const", + "file_size": 20971520 + }, + "model.layers.14.attn.qk_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 5120 + ], + "size_in_bytes": 20480, + "op_tensor_size": 20480, + "offset": 2367832064, + "file_name": ".cache\\MatMulNBits_2_0_365.const", + "file_size": 20480 + }, + "model.layers.14.attn.qk_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 163840 + ], + "size_in_bytes": 655360, + "op_tensor_size": 655360, + "offset": 2367852544, + "file_name": ".cache\\MatMulNBits_2_0_366.const", + "file_size": 655360 + }, + "model.layers.14.attn.qk_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 163840 + ], + "size_in_bytes": 163840, + "op_tensor_size": 163840, + "offset": 2368507904, + "file_name": ".cache\\MatMulNBits_2_0_367.const", + "file_size": 163840 + }, + "model.layers.14.attn.v_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 1024 + ], + "size_in_bytes": 4194304, + "op_tensor_size": 4194304, + "offset": 2368671744, + "file_name": ".cache\\MatMulNBits_2_0_368.const", + "file_size": 4194304 + }, + "model.layers.14.attn.v_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 1024 + ], + "size_in_bytes": 4096, + "op_tensor_size": 4096, + "offset": 2372866048, + "file_name": ".cache\\MatMulNBits_2_0_369.const", + "file_size": 4096 + }, + "model.layers.14.attn.v_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 32768 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 2372870144, + "file_name": ".cache\\MatMulNBits_2_0_370.const", + "file_size": 131072 + }, + "model.layers.14.attn.v_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 32768 + ], + "size_in_bytes": 32768, + "op_tensor_size": 32768, + "offset": 2373001216, + "file_name": ".cache\\MatMulNBits_2_0_371.const", + "file_size": 32768 + }, + "model.layers.14.attn.o_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 4096 + ], + "size_in_bytes": 16777216, + "op_tensor_size": 16777216, + "offset": 2373033984, + "file_name": ".cache\\MatMulNBits_2_0_372.const", + "file_size": 16777216 + }, + "model.layers.14.attn.o_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 2389811200, + "file_name": ".cache\\MatMulNBits_2_0_373.const", + "file_size": 16384 + }, + "model.layers.14.attn.o_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 131072 + ], + "size_in_bytes": 524288, + "op_tensor_size": 524288, + "offset": 2389827584, + "file_name": ".cache\\MatMulNBits_2_0_374.const", + "file_size": 524288 + }, + "model.layers.14.attn.o_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 131072 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 2390351872, + "file_name": ".cache\\MatMulNBits_2_0_375.const", + "file_size": 131072 + }, + "model.layers.14.post_attention_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2390482944, + "file_name": ".cache\\MatMulNBits_2_0_376.const", + "file_size": 8192 + }, + "model.layers.14.mlp.gate_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 2390491136, + "file_name": ".cache\\MatMulNBits_2_0_377.const", + "file_size": 29360128 + }, + "model.layers.14.mlp.gate_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 2419851264, + "file_name": ".cache\\MatMulNBits_2_0_378.const", + "file_size": 1835008 + }, + "model.layers.14.mlp.gate_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 2421686272, + "file_name": ".cache\\MatMulNBits_2_0_379.const", + "file_size": 229376 + }, + "model.layers.14.mlp.gate_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 2421915648, + "file_name": ".cache\\MatMulNBits_2_0_380.const", + "file_size": 57344 + }, + "model.layers.14.mlp.up_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 2421972992, + "file_name": ".cache\\MatMulNBits_2_0_381.const", + "file_size": 29360128 + }, + "model.layers.14.mlp.up_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 2451333120, + "file_name": ".cache\\MatMulNBits_2_0_382.const", + "file_size": 1835008 + }, + "model.layers.14.mlp.up_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 2453168128, + "file_name": ".cache\\MatMulNBits_2_0_383.const", + "file_size": 229376 + }, + "model.layers.14.mlp.up_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 2453397504, + "file_name": ".cache\\MatMulNBits_2_0_384.const", + "file_size": 57344 + }, + "model.layers.14.mlp.down_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 14336, + 4096 + ], + "size_in_bytes": 58720256, + "op_tensor_size": 58720256, + "offset": 2453454848, + "file_name": ".cache\\MatMulNBits_2_0_385.const", + "file_size": 58720256 + }, + "model.layers.14.mlp.down_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 2512175104, + "file_name": ".cache\\MatMulNBits_2_0_386.const", + "file_size": 16384 + }, + "model.layers.14.mlp.down_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 2512191488, + "file_name": ".cache\\MatMulNBits_2_0_387.const", + "file_size": 1835008 + }, + "model.layers.14.mlp.down_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 458752 + ], + "size_in_bytes": 458752, + "op_tensor_size": 458752, + "offset": 2514026496, + "file_name": ".cache\\MatMulNBits_2_0_388.const", + "file_size": 458752 + }, + "model.layers.15.input_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2514485248, + "file_name": ".cache\\MatMulNBits_2_0_389.const", + "file_size": 8192 + }, + "model.layers.15.attn.qk_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 5120 + ], + "size_in_bytes": 20971520, + "op_tensor_size": 20971520, + "offset": 2514493440, + "file_name": ".cache\\MatMulNBits_2_0_390.const", + "file_size": 20971520 + }, + "model.layers.15.attn.qk_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 5120 + ], + "size_in_bytes": 20480, + "op_tensor_size": 20480, + "offset": 2535464960, + "file_name": ".cache\\MatMulNBits_2_0_391.const", + "file_size": 20480 + }, + "model.layers.15.attn.qk_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 163840 + ], + "size_in_bytes": 655360, + "op_tensor_size": 655360, + "offset": 2535485440, + "file_name": ".cache\\MatMulNBits_2_0_392.const", + "file_size": 655360 + }, + "model.layers.15.attn.qk_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 163840 + ], + "size_in_bytes": 163840, + "op_tensor_size": 163840, + "offset": 2536140800, + "file_name": ".cache\\MatMulNBits_2_0_393.const", + "file_size": 163840 + }, + "model.layers.15.attn.v_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 1024 + ], + "size_in_bytes": 4194304, + "op_tensor_size": 4194304, + "offset": 2536304640, + "file_name": ".cache\\MatMulNBits_2_0_394.const", + "file_size": 4194304 + }, + "model.layers.15.attn.v_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 1024 + ], + "size_in_bytes": 4096, + "op_tensor_size": 4096, + "offset": 2540498944, + "file_name": ".cache\\MatMulNBits_2_0_395.const", + "file_size": 4096 + }, + "model.layers.15.attn.v_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 32768 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 2540503040, + "file_name": ".cache\\MatMulNBits_2_0_396.const", + "file_size": 131072 + }, + "model.layers.15.attn.v_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 32768 + ], + "size_in_bytes": 32768, + "op_tensor_size": 32768, + "offset": 2540634112, + "file_name": ".cache\\MatMulNBits_2_0_397.const", + "file_size": 32768 + }, + "model.layers.15.attn.o_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 4096 + ], + "size_in_bytes": 16777216, + "op_tensor_size": 16777216, + "offset": 2540666880, + "file_name": ".cache\\MatMulNBits_2_0_398.const", + "file_size": 16777216 + }, + "model.layers.15.attn.o_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 2557444096, + "file_name": ".cache\\MatMulNBits_2_0_399.const", + "file_size": 16384 + }, + "model.layers.15.attn.o_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 131072 + ], + "size_in_bytes": 524288, + "op_tensor_size": 524288, + "offset": 2557460480, + "file_name": ".cache\\MatMulNBits_2_0_400.const", + "file_size": 524288 + }, + "model.layers.15.attn.o_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 131072 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 2557984768, + "file_name": ".cache\\MatMulNBits_2_0_401.const", + "file_size": 131072 + }, + "model.layers.15.post_attention_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2558115840, + "file_name": ".cache\\MatMulNBits_2_0_402.const", + "file_size": 8192 + }, + "model.layers.15.mlp.gate_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 2558124032, + "file_name": ".cache\\MatMulNBits_2_0_403.const", + "file_size": 29360128 + }, + "model.layers.15.mlp.gate_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 2587484160, + "file_name": ".cache\\MatMulNBits_2_0_404.const", + "file_size": 1835008 + }, + "model.layers.15.mlp.gate_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 2589319168, + "file_name": ".cache\\MatMulNBits_2_0_405.const", + "file_size": 229376 + }, + "model.layers.15.mlp.gate_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 2589548544, + "file_name": ".cache\\MatMulNBits_2_0_406.const", + "file_size": 57344 + }, + "model.layers.15.mlp.up_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 2589605888, + "file_name": ".cache\\MatMulNBits_2_0_407.const", + "file_size": 29360128 + }, + "model.layers.15.mlp.up_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 2618966016, + "file_name": ".cache\\MatMulNBits_2_0_408.const", + "file_size": 1835008 + }, + "model.layers.15.mlp.up_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 2620801024, + "file_name": ".cache\\MatMulNBits_2_0_409.const", + "file_size": 229376 + }, + "model.layers.15.mlp.up_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 2621030400, + "file_name": ".cache\\MatMulNBits_2_0_410.const", + "file_size": 57344 + }, + "model.layers.15.mlp.down_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 14336, + 4096 + ], + "size_in_bytes": 58720256, + "op_tensor_size": 58720256, + "offset": 2621087744, + "file_name": ".cache\\MatMulNBits_2_0_411.const", + "file_size": 58720256 + }, + "model.layers.15.mlp.down_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 2679808000, + "file_name": ".cache\\MatMulNBits_2_0_412.const", + "file_size": 16384 + }, + "model.layers.15.mlp.down_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 2679824384, + "file_name": ".cache\\MatMulNBits_2_0_413.const", + "file_size": 1835008 + }, + "model.layers.15.mlp.down_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 458752 + ], + "size_in_bytes": 458752, + "op_tensor_size": 458752, + "offset": 2681659392, + "file_name": ".cache\\MatMulNBits_2_0_414.const", + "file_size": 458752 + }, + "model.layers.16.input_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2682118144, + "file_name": ".cache\\MatMulNBits_2_0_415.const", + "file_size": 8192 + }, + "model.layers.16.attn.qk_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 5120 + ], + "size_in_bytes": 20971520, + "op_tensor_size": 20971520, + "offset": 2682126336, + "file_name": ".cache\\MatMulNBits_2_0_416.const", + "file_size": 20971520 + }, + "model.layers.16.attn.qk_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 5120 + ], + "size_in_bytes": 20480, + "op_tensor_size": 20480, + "offset": 2703097856, + "file_name": ".cache\\MatMulNBits_2_0_417.const", + "file_size": 20480 + }, + "model.layers.16.attn.qk_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 163840 + ], + "size_in_bytes": 655360, + "op_tensor_size": 655360, + "offset": 2703118336, + "file_name": ".cache\\MatMulNBits_2_0_418.const", + "file_size": 655360 + }, + "model.layers.16.attn.qk_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 163840 + ], + "size_in_bytes": 163840, + "op_tensor_size": 163840, + "offset": 2703773696, + "file_name": ".cache\\MatMulNBits_2_0_419.const", + "file_size": 163840 + }, + "model.layers.16.attn.v_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 1024 + ], + "size_in_bytes": 4194304, + "op_tensor_size": 4194304, + "offset": 2703937536, + "file_name": ".cache\\MatMulNBits_2_0_420.const", + "file_size": 4194304 + }, + "model.layers.16.attn.v_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 1024 + ], + "size_in_bytes": 4096, + "op_tensor_size": 4096, + "offset": 2708131840, + "file_name": ".cache\\MatMulNBits_2_0_421.const", + "file_size": 4096 + }, + "model.layers.16.attn.v_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 32768 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 2708135936, + "file_name": ".cache\\MatMulNBits_2_0_422.const", + "file_size": 131072 + }, + "model.layers.16.attn.v_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 32768 + ], + "size_in_bytes": 32768, + "op_tensor_size": 32768, + "offset": 2708267008, + "file_name": ".cache\\MatMulNBits_2_0_423.const", + "file_size": 32768 + }, + "model.layers.16.attn.o_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 4096 + ], + "size_in_bytes": 16777216, + "op_tensor_size": 16777216, + "offset": 2708299776, + "file_name": ".cache\\MatMulNBits_2_0_424.const", + "file_size": 16777216 + }, + "model.layers.16.attn.o_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 2725076992, + "file_name": ".cache\\MatMulNBits_2_0_425.const", + "file_size": 16384 + }, + "model.layers.16.attn.o_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 131072 + ], + "size_in_bytes": 524288, + "op_tensor_size": 524288, + "offset": 2725093376, + "file_name": ".cache\\MatMulNBits_2_0_426.const", + "file_size": 524288 + }, + "model.layers.16.attn.o_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 131072 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 2725617664, + "file_name": ".cache\\MatMulNBits_2_0_427.const", + "file_size": 131072 + }, + "model.layers.16.post_attention_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2725748736, + "file_name": ".cache\\MatMulNBits_2_0_428.const", + "file_size": 8192 + }, + "model.layers.16.mlp.gate_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 2725756928, + "file_name": ".cache\\MatMulNBits_2_0_429.const", + "file_size": 29360128 + }, + "model.layers.16.mlp.gate_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 2755117056, + "file_name": ".cache\\MatMulNBits_2_0_430.const", + "file_size": 1835008 + }, + "model.layers.16.mlp.gate_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 2756952064, + "file_name": ".cache\\MatMulNBits_2_0_431.const", + "file_size": 229376 + }, + "model.layers.16.mlp.gate_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 2757181440, + "file_name": ".cache\\MatMulNBits_2_0_432.const", + "file_size": 57344 + }, + "model.layers.16.mlp.up_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 2757238784, + "file_name": ".cache\\MatMulNBits_2_0_433.const", + "file_size": 29360128 + }, + "model.layers.16.mlp.up_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 2786598912, + "file_name": ".cache\\MatMulNBits_2_0_434.const", + "file_size": 1835008 + }, + "model.layers.16.mlp.up_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 2788433920, + "file_name": ".cache\\MatMulNBits_2_0_435.const", + "file_size": 229376 + }, + "model.layers.16.mlp.up_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 2788663296, + "file_name": ".cache\\MatMulNBits_2_0_436.const", + "file_size": 57344 + }, + "model.layers.16.mlp.down_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 14336, + 4096 + ], + "size_in_bytes": 58720256, + "op_tensor_size": 58720256, + "offset": 2788720640, + "file_name": ".cache\\MatMulNBits_2_0_437.const", + "file_size": 58720256 + }, + "model.layers.16.mlp.down_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 2847440896, + "file_name": ".cache\\MatMulNBits_2_0_438.const", + "file_size": 16384 + }, + "model.layers.16.mlp.down_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 2847457280, + "file_name": ".cache\\MatMulNBits_2_0_439.const", + "file_size": 1835008 + }, + "model.layers.16.mlp.down_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 458752 + ], + "size_in_bytes": 458752, + "op_tensor_size": 458752, + "offset": 2849292288, + "file_name": ".cache\\MatMulNBits_2_0_440.const", + "file_size": 458752 + }, + "model.layers.17.input_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2849751040, + "file_name": ".cache\\MatMulNBits_2_0_441.const", + "file_size": 8192 + }, + "model.layers.17.attn.qk_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 5120 + ], + "size_in_bytes": 20971520, + "op_tensor_size": 20971520, + "offset": 2849759232, + "file_name": ".cache\\MatMulNBits_2_0_442.const", + "file_size": 20971520 + }, + "model.layers.17.attn.qk_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 5120 + ], + "size_in_bytes": 20480, + "op_tensor_size": 20480, + "offset": 2870730752, + "file_name": ".cache\\MatMulNBits_2_0_443.const", + "file_size": 20480 + }, + "model.layers.17.attn.qk_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 163840 + ], + "size_in_bytes": 655360, + "op_tensor_size": 655360, + "offset": 2870751232, + "file_name": ".cache\\MatMulNBits_2_0_444.const", + "file_size": 655360 + }, + "model.layers.17.attn.qk_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 163840 + ], + "size_in_bytes": 163840, + "op_tensor_size": 163840, + "offset": 2871406592, + "file_name": ".cache\\MatMulNBits_2_0_445.const", + "file_size": 163840 + }, + "model.layers.17.attn.v_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 1024 + ], + "size_in_bytes": 4194304, + "op_tensor_size": 4194304, + "offset": 2871570432, + "file_name": ".cache\\MatMulNBits_2_0_446.const", + "file_size": 4194304 + }, + "model.layers.17.attn.v_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 1024 + ], + "size_in_bytes": 4096, + "op_tensor_size": 4096, + "offset": 2875764736, + "file_name": ".cache\\MatMulNBits_2_0_447.const", + "file_size": 4096 + }, + "model.layers.17.attn.v_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 32768 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 2875768832, + "file_name": ".cache\\MatMulNBits_2_0_448.const", + "file_size": 131072 + }, + "model.layers.17.attn.v_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 32768 + ], + "size_in_bytes": 32768, + "op_tensor_size": 32768, + "offset": 2875899904, + "file_name": ".cache\\MatMulNBits_2_0_449.const", + "file_size": 32768 + }, + "model.layers.17.attn.o_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 4096 + ], + "size_in_bytes": 16777216, + "op_tensor_size": 16777216, + "offset": 2875932672, + "file_name": ".cache\\MatMulNBits_2_0_450.const", + "file_size": 16777216 + }, + "model.layers.17.attn.o_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 2892709888, + "file_name": ".cache\\MatMulNBits_2_0_451.const", + "file_size": 16384 + }, + "model.layers.17.attn.o_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 131072 + ], + "size_in_bytes": 524288, + "op_tensor_size": 524288, + "offset": 2892726272, + "file_name": ".cache\\MatMulNBits_2_0_452.const", + "file_size": 524288 + }, + "model.layers.17.attn.o_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 131072 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 2893250560, + "file_name": ".cache\\MatMulNBits_2_0_453.const", + "file_size": 131072 + }, + "model.layers.17.post_attention_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 2893381632, + "file_name": ".cache\\MatMulNBits_2_0_454.const", + "file_size": 8192 + }, + "model.layers.17.mlp.gate_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 2893389824, + "file_name": ".cache\\MatMulNBits_2_0_455.const", + "file_size": 29360128 + }, + "model.layers.17.mlp.gate_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 2922749952, + "file_name": ".cache\\MatMulNBits_2_0_456.const", + "file_size": 1835008 + }, + "model.layers.17.mlp.gate_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 2924584960, + "file_name": ".cache\\MatMulNBits_2_0_457.const", + "file_size": 229376 + }, + "model.layers.17.mlp.gate_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 2924814336, + "file_name": ".cache\\MatMulNBits_2_0_458.const", + "file_size": 57344 + }, + "model.layers.17.mlp.up_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 2924871680, + "file_name": ".cache\\MatMulNBits_2_0_459.const", + "file_size": 29360128 + }, + "model.layers.17.mlp.up_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 2954231808, + "file_name": ".cache\\MatMulNBits_2_0_460.const", + "file_size": 1835008 + }, + "model.layers.17.mlp.up_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 2956066816, + "file_name": ".cache\\MatMulNBits_2_0_461.const", + "file_size": 229376 + }, + "model.layers.17.mlp.up_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 2956296192, + "file_name": ".cache\\MatMulNBits_2_0_462.const", + "file_size": 57344 + }, + "model.layers.17.mlp.down_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 14336, + 4096 + ], + "size_in_bytes": 58720256, + "op_tensor_size": 58720256, + "offset": 2956353536, + "file_name": ".cache\\MatMulNBits_2_0_463.const", + "file_size": 58720256 + }, + "model.layers.17.mlp.down_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 3015073792, + "file_name": ".cache\\MatMulNBits_2_0_464.const", + "file_size": 16384 + }, + "model.layers.17.mlp.down_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 3015090176, + "file_name": ".cache\\MatMulNBits_2_0_465.const", + "file_size": 1835008 + }, + "model.layers.17.mlp.down_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 458752 + ], + "size_in_bytes": 458752, + "op_tensor_size": 458752, + "offset": 3016925184, + "file_name": ".cache\\MatMulNBits_2_0_466.const", + "file_size": 458752 + }, + "model.layers.18.input_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 3017383936, + "file_name": ".cache\\MatMulNBits_2_0_467.const", + "file_size": 8192 + }, + "model.layers.18.attn.qk_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 5120 + ], + "size_in_bytes": 20971520, + "op_tensor_size": 20971520, + "offset": 3017392128, + "file_name": ".cache\\MatMulNBits_2_0_468.const", + "file_size": 20971520 + }, + "model.layers.18.attn.qk_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 5120 + ], + "size_in_bytes": 20480, + "op_tensor_size": 20480, + "offset": 3038363648, + "file_name": ".cache\\MatMulNBits_2_0_469.const", + "file_size": 20480 + }, + "model.layers.18.attn.qk_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 163840 + ], + "size_in_bytes": 655360, + "op_tensor_size": 655360, + "offset": 3038384128, + "file_name": ".cache\\MatMulNBits_2_0_470.const", + "file_size": 655360 + }, + "model.layers.18.attn.qk_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 163840 + ], + "size_in_bytes": 163840, + "op_tensor_size": 163840, + "offset": 3039039488, + "file_name": ".cache\\MatMulNBits_2_0_471.const", + "file_size": 163840 + }, + "model.layers.18.attn.v_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 1024 + ], + "size_in_bytes": 4194304, + "op_tensor_size": 4194304, + "offset": 3039203328, + "file_name": ".cache\\MatMulNBits_2_0_472.const", + "file_size": 4194304 + }, + "model.layers.18.attn.v_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 1024 + ], + "size_in_bytes": 4096, + "op_tensor_size": 4096, + "offset": 3043397632, + "file_name": ".cache\\MatMulNBits_2_0_473.const", + "file_size": 4096 + }, + "model.layers.18.attn.v_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 32768 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 3043401728, + "file_name": ".cache\\MatMulNBits_2_0_474.const", + "file_size": 131072 + }, + "model.layers.18.attn.v_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 32768 + ], + "size_in_bytes": 32768, + "op_tensor_size": 32768, + "offset": 3043532800, + "file_name": ".cache\\MatMulNBits_2_0_475.const", + "file_size": 32768 + }, + "model.layers.18.attn.o_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 4096 + ], + "size_in_bytes": 16777216, + "op_tensor_size": 16777216, + "offset": 3043565568, + "file_name": ".cache\\MatMulNBits_2_0_476.const", + "file_size": 16777216 + }, + "model.layers.18.attn.o_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 3060342784, + "file_name": ".cache\\MatMulNBits_2_0_477.const", + "file_size": 16384 + }, + "model.layers.18.attn.o_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 131072 + ], + "size_in_bytes": 524288, + "op_tensor_size": 524288, + "offset": 3060359168, + "file_name": ".cache\\MatMulNBits_2_0_478.const", + "file_size": 524288 + }, + "model.layers.18.attn.o_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 131072 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 3060883456, + "file_name": ".cache\\MatMulNBits_2_0_479.const", + "file_size": 131072 + }, + "model.layers.18.post_attention_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 3061014528, + "file_name": ".cache\\MatMulNBits_2_0_480.const", + "file_size": 8192 + }, + "model.layers.18.mlp.gate_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 3061022720, + "file_name": ".cache\\MatMulNBits_2_0_481.const", + "file_size": 29360128 + }, + "model.layers.18.mlp.gate_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 3090382848, + "file_name": ".cache\\MatMulNBits_2_0_482.const", + "file_size": 1835008 + }, + "model.layers.18.mlp.gate_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 3092217856, + "file_name": ".cache\\MatMulNBits_2_0_483.const", + "file_size": 229376 + }, + "model.layers.18.mlp.gate_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 3092447232, + "file_name": ".cache\\MatMulNBits_2_0_484.const", + "file_size": 57344 + }, + "model.layers.18.mlp.up_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 3092504576, + "file_name": ".cache\\MatMulNBits_2_0_485.const", + "file_size": 29360128 + }, + "model.layers.18.mlp.up_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 3121864704, + "file_name": ".cache\\MatMulNBits_2_0_486.const", + "file_size": 1835008 + }, + "model.layers.18.mlp.up_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 3123699712, + "file_name": ".cache\\MatMulNBits_2_0_487.const", + "file_size": 229376 + }, + "model.layers.18.mlp.up_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 3123929088, + "file_name": ".cache\\MatMulNBits_2_0_488.const", + "file_size": 57344 + }, + "model.layers.18.mlp.down_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 14336, + 4096 + ], + "size_in_bytes": 58720256, + "op_tensor_size": 58720256, + "offset": 3123986432, + "file_name": ".cache\\MatMulNBits_2_0_489.const", + "file_size": 58720256 + }, + "model.layers.18.mlp.down_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 3182706688, + "file_name": ".cache\\MatMulNBits_2_0_490.const", + "file_size": 16384 + }, + "model.layers.18.mlp.down_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 3182723072, + "file_name": ".cache\\MatMulNBits_2_0_491.const", + "file_size": 1835008 + }, + "model.layers.18.mlp.down_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 458752 + ], + "size_in_bytes": 458752, + "op_tensor_size": 458752, + "offset": 3184558080, + "file_name": ".cache\\MatMulNBits_2_0_492.const", + "file_size": 458752 + }, + "model.layers.19.input_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 3185016832, + "file_name": ".cache\\MatMulNBits_2_0_493.const", + "file_size": 8192 + }, + "model.layers.19.attn.qk_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 5120 + ], + "size_in_bytes": 20971520, + "op_tensor_size": 20971520, + "offset": 3185025024, + "file_name": ".cache\\MatMulNBits_2_0_494.const", + "file_size": 20971520 + }, + "model.layers.19.attn.qk_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 5120 + ], + "size_in_bytes": 20480, + "op_tensor_size": 20480, + "offset": 3205996544, + "file_name": ".cache\\MatMulNBits_2_0_495.const", + "file_size": 20480 + }, + "model.layers.19.attn.qk_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 163840 + ], + "size_in_bytes": 655360, + "op_tensor_size": 655360, + "offset": 3206017024, + "file_name": ".cache\\MatMulNBits_2_0_496.const", + "file_size": 655360 + }, + "model.layers.19.attn.qk_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 163840 + ], + "size_in_bytes": 163840, + "op_tensor_size": 163840, + "offset": 3206672384, + "file_name": ".cache\\MatMulNBits_2_0_497.const", + "file_size": 163840 + }, + "model.layers.19.attn.v_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 1024 + ], + "size_in_bytes": 4194304, + "op_tensor_size": 4194304, + "offset": 3206836224, + "file_name": ".cache\\MatMulNBits_2_0_498.const", + "file_size": 4194304 + }, + "model.layers.19.attn.v_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 1024 + ], + "size_in_bytes": 4096, + "op_tensor_size": 4096, + "offset": 3211030528, + "file_name": ".cache\\MatMulNBits_2_0_499.const", + "file_size": 4096 + }, + "model.layers.19.attn.v_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 32768 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 3211034624, + "file_name": ".cache\\MatMulNBits_2_0_500.const", + "file_size": 131072 + }, + "model.layers.19.attn.v_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 32768 + ], + "size_in_bytes": 32768, + "op_tensor_size": 32768, + "offset": 3211165696, + "file_name": ".cache\\MatMulNBits_2_0_501.const", + "file_size": 32768 + }, + "model.layers.19.attn.o_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 4096 + ], + "size_in_bytes": 16777216, + "op_tensor_size": 16777216, + "offset": 3211198464, + "file_name": ".cache\\MatMulNBits_2_0_502.const", + "file_size": 16777216 + }, + "model.layers.19.attn.o_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 3227975680, + "file_name": ".cache\\MatMulNBits_2_0_503.const", + "file_size": 16384 + }, + "model.layers.19.attn.o_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 131072 + ], + "size_in_bytes": 524288, + "op_tensor_size": 524288, + "offset": 3227992064, + "file_name": ".cache\\MatMulNBits_2_0_504.const", + "file_size": 524288 + }, + "model.layers.19.attn.o_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 131072 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 3228516352, + "file_name": ".cache\\MatMulNBits_2_0_505.const", + "file_size": 131072 + }, + "model.layers.19.post_attention_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 3228647424, + "file_name": ".cache\\MatMulNBits_2_0_506.const", + "file_size": 8192 + }, + "model.layers.19.mlp.gate_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 3228655616, + "file_name": ".cache\\MatMulNBits_2_0_507.const", + "file_size": 29360128 + }, + "model.layers.19.mlp.gate_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 3258015744, + "file_name": ".cache\\MatMulNBits_2_0_508.const", + "file_size": 1835008 + }, + "model.layers.19.mlp.gate_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 3259850752, + "file_name": ".cache\\MatMulNBits_2_0_509.const", + "file_size": 229376 + }, + "model.layers.19.mlp.gate_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 3260080128, + "file_name": ".cache\\MatMulNBits_2_0_510.const", + "file_size": 57344 + }, + "model.layers.19.mlp.up_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 3260137472, + "file_name": ".cache\\MatMulNBits_2_0_511.const", + "file_size": 29360128 + }, + "model.layers.19.mlp.up_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 3289497600, + "file_name": ".cache\\MatMulNBits_2_0_512.const", + "file_size": 1835008 + }, + "model.layers.19.mlp.up_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 3291332608, + "file_name": ".cache\\MatMulNBits_2_0_513.const", + "file_size": 229376 + }, + "model.layers.19.mlp.up_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 3291561984, + "file_name": ".cache\\MatMulNBits_2_0_514.const", + "file_size": 57344 + }, + "model.layers.19.mlp.down_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 14336, + 4096 + ], + "size_in_bytes": 58720256, + "op_tensor_size": 58720256, + "offset": 3291619328, + "file_name": ".cache\\MatMulNBits_2_0_515.const", + "file_size": 58720256 + }, + "model.layers.19.mlp.down_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 3350339584, + "file_name": ".cache\\MatMulNBits_2_0_516.const", + "file_size": 16384 + }, + "model.layers.19.mlp.down_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 3350355968, + "file_name": ".cache\\MatMulNBits_2_0_517.const", + "file_size": 1835008 + }, + "model.layers.19.mlp.down_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 458752 + ], + "size_in_bytes": 458752, + "op_tensor_size": 458752, + "offset": 3352190976, + "file_name": ".cache\\MatMulNBits_2_0_518.const", + "file_size": 458752 + }, + "model.layers.20.input_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 3352649728, + "file_name": ".cache\\MatMulNBits_2_0_519.const", + "file_size": 8192 + }, + "model.layers.20.attn.qk_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 5120 + ], + "size_in_bytes": 20971520, + "op_tensor_size": 20971520, + "offset": 3352657920, + "file_name": ".cache\\MatMulNBits_2_0_520.const", + "file_size": 20971520 + }, + "model.layers.20.attn.qk_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 5120 + ], + "size_in_bytes": 20480, + "op_tensor_size": 20480, + "offset": 3373629440, + "file_name": ".cache\\MatMulNBits_2_0_521.const", + "file_size": 20480 + }, + "model.layers.20.attn.qk_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 163840 + ], + "size_in_bytes": 655360, + "op_tensor_size": 655360, + "offset": 3373649920, + "file_name": ".cache\\MatMulNBits_2_0_522.const", + "file_size": 655360 + }, + "model.layers.20.attn.qk_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 163840 + ], + "size_in_bytes": 163840, + "op_tensor_size": 163840, + "offset": 3374305280, + "file_name": ".cache\\MatMulNBits_2_0_523.const", + "file_size": 163840 + }, + "model.layers.20.attn.v_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 1024 + ], + "size_in_bytes": 4194304, + "op_tensor_size": 4194304, + "offset": 3374469120, + "file_name": ".cache\\MatMulNBits_2_0_524.const", + "file_size": 4194304 + }, + "model.layers.20.attn.v_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 1024 + ], + "size_in_bytes": 4096, + "op_tensor_size": 4096, + "offset": 3378663424, + "file_name": ".cache\\MatMulNBits_2_0_525.const", + "file_size": 4096 + }, + "model.layers.20.attn.v_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 32768 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 3378667520, + "file_name": ".cache\\MatMulNBits_2_0_526.const", + "file_size": 131072 + }, + "model.layers.20.attn.v_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 32768 + ], + "size_in_bytes": 32768, + "op_tensor_size": 32768, + "offset": 3378798592, + "file_name": ".cache\\MatMulNBits_2_0_527.const", + "file_size": 32768 + }, + "model.layers.20.attn.o_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 4096 + ], + "size_in_bytes": 16777216, + "op_tensor_size": 16777216, + "offset": 3378831360, + "file_name": ".cache\\MatMulNBits_2_0_528.const", + "file_size": 16777216 + }, + "model.layers.20.attn.o_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 3395608576, + "file_name": ".cache\\MatMulNBits_2_0_529.const", + "file_size": 16384 + }, + "model.layers.20.attn.o_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 131072 + ], + "size_in_bytes": 524288, + "op_tensor_size": 524288, + "offset": 3395624960, + "file_name": ".cache\\MatMulNBits_2_0_530.const", + "file_size": 524288 + }, + "model.layers.20.attn.o_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 131072 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 3396149248, + "file_name": ".cache\\MatMulNBits_2_0_531.const", + "file_size": 131072 + }, + "model.layers.20.post_attention_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 3396280320, + "file_name": ".cache\\MatMulNBits_2_0_532.const", + "file_size": 8192 + }, + "model.layers.20.mlp.gate_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 3396288512, + "file_name": ".cache\\MatMulNBits_2_0_533.const", + "file_size": 29360128 + }, + "model.layers.20.mlp.gate_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 3425648640, + "file_name": ".cache\\MatMulNBits_2_0_534.const", + "file_size": 1835008 + }, + "model.layers.20.mlp.gate_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 3427483648, + "file_name": ".cache\\MatMulNBits_2_0_535.const", + "file_size": 229376 + }, + "model.layers.20.mlp.gate_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 3427713024, + "file_name": ".cache\\MatMulNBits_2_0_536.const", + "file_size": 57344 + }, + "model.layers.20.mlp.up_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 3427770368, + "file_name": ".cache\\MatMulNBits_2_0_537.const", + "file_size": 29360128 + }, + "model.layers.20.mlp.up_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 3457130496, + "file_name": ".cache\\MatMulNBits_2_0_538.const", + "file_size": 1835008 + }, + "model.layers.20.mlp.up_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 3458965504, + "file_name": ".cache\\MatMulNBits_2_0_539.const", + "file_size": 229376 + }, + "model.layers.20.mlp.up_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 3459194880, + "file_name": ".cache\\MatMulNBits_2_0_540.const", + "file_size": 57344 + }, + "model.layers.20.mlp.down_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 14336, + 4096 + ], + "size_in_bytes": 58720256, + "op_tensor_size": 58720256, + "offset": 3459252224, + "file_name": ".cache\\MatMulNBits_2_0_541.const", + "file_size": 58720256 + }, + "model.layers.20.mlp.down_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 3517972480, + "file_name": ".cache\\MatMulNBits_2_0_542.const", + "file_size": 16384 + }, + "model.layers.20.mlp.down_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 3517988864, + "file_name": ".cache\\MatMulNBits_2_0_543.const", + "file_size": 1835008 + }, + "model.layers.20.mlp.down_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 458752 + ], + "size_in_bytes": 458752, + "op_tensor_size": 458752, + "offset": 3519823872, + "file_name": ".cache\\MatMulNBits_2_0_544.const", + "file_size": 458752 + }, + "model.layers.21.input_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 3520282624, + "file_name": ".cache\\MatMulNBits_2_0_545.const", + "file_size": 8192 + }, + "model.layers.21.attn.qk_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 5120 + ], + "size_in_bytes": 20971520, + "op_tensor_size": 20971520, + "offset": 3520290816, + "file_name": ".cache\\MatMulNBits_2_0_546.const", + "file_size": 20971520 + }, + "model.layers.21.attn.qk_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 5120 + ], + "size_in_bytes": 20480, + "op_tensor_size": 20480, + "offset": 3541262336, + "file_name": ".cache\\MatMulNBits_2_0_547.const", + "file_size": 20480 + }, + "model.layers.21.attn.qk_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 163840 + ], + "size_in_bytes": 655360, + "op_tensor_size": 655360, + "offset": 3541282816, + "file_name": ".cache\\MatMulNBits_2_0_548.const", + "file_size": 655360 + }, + "model.layers.21.attn.qk_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 163840 + ], + "size_in_bytes": 163840, + "op_tensor_size": 163840, + "offset": 3541938176, + "file_name": ".cache\\MatMulNBits_2_0_549.const", + "file_size": 163840 + }, + "model.layers.21.attn.v_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 1024 + ], + "size_in_bytes": 4194304, + "op_tensor_size": 4194304, + "offset": 3542102016, + "file_name": ".cache\\MatMulNBits_2_0_550.const", + "file_size": 4194304 + }, + "model.layers.21.attn.v_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 1024 + ], + "size_in_bytes": 4096, + "op_tensor_size": 4096, + "offset": 3546296320, + "file_name": ".cache\\MatMulNBits_2_0_551.const", + "file_size": 4096 + }, + "model.layers.21.attn.v_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 32768 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 3546300416, + "file_name": ".cache\\MatMulNBits_2_0_552.const", + "file_size": 131072 + }, + "model.layers.21.attn.v_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 32768 + ], + "size_in_bytes": 32768, + "op_tensor_size": 32768, + "offset": 3546431488, + "file_name": ".cache\\MatMulNBits_2_0_553.const", + "file_size": 32768 + }, + "model.layers.21.attn.o_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 4096 + ], + "size_in_bytes": 16777216, + "op_tensor_size": 16777216, + "offset": 3546464256, + "file_name": ".cache\\MatMulNBits_2_0_554.const", + "file_size": 16777216 + }, + "model.layers.21.attn.o_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 3563241472, + "file_name": ".cache\\MatMulNBits_2_0_555.const", + "file_size": 16384 + }, + "model.layers.21.attn.o_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 131072 + ], + "size_in_bytes": 524288, + "op_tensor_size": 524288, + "offset": 3563257856, + "file_name": ".cache\\MatMulNBits_2_0_556.const", + "file_size": 524288 + }, + "model.layers.21.attn.o_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 131072 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 3563782144, + "file_name": ".cache\\MatMulNBits_2_0_557.const", + "file_size": 131072 + }, + "model.layers.21.post_attention_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 3563913216, + "file_name": ".cache\\MatMulNBits_2_0_558.const", + "file_size": 8192 + }, + "model.layers.21.mlp.gate_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 3563921408, + "file_name": ".cache\\MatMulNBits_2_0_559.const", + "file_size": 29360128 + }, + "model.layers.21.mlp.gate_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 3593281536, + "file_name": ".cache\\MatMulNBits_2_0_560.const", + "file_size": 1835008 + }, + "model.layers.21.mlp.gate_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 3595116544, + "file_name": ".cache\\MatMulNBits_2_0_561.const", + "file_size": 229376 + }, + "model.layers.21.mlp.gate_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 3595345920, + "file_name": ".cache\\MatMulNBits_2_0_562.const", + "file_size": 57344 + }, + "model.layers.21.mlp.up_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 3595403264, + "file_name": ".cache\\MatMulNBits_2_0_563.const", + "file_size": 29360128 + }, + "model.layers.21.mlp.up_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 3624763392, + "file_name": ".cache\\MatMulNBits_2_0_564.const", + "file_size": 1835008 + }, + "model.layers.21.mlp.up_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 3626598400, + "file_name": ".cache\\MatMulNBits_2_0_565.const", + "file_size": 229376 + }, + "model.layers.21.mlp.up_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 3626827776, + "file_name": ".cache\\MatMulNBits_2_0_566.const", + "file_size": 57344 + }, + "model.layers.21.mlp.down_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 14336, + 4096 + ], + "size_in_bytes": 58720256, + "op_tensor_size": 58720256, + "offset": 3626885120, + "file_name": ".cache\\MatMulNBits_2_0_567.const", + "file_size": 58720256 + }, + "model.layers.21.mlp.down_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 3685605376, + "file_name": ".cache\\MatMulNBits_2_0_568.const", + "file_size": 16384 + }, + "model.layers.21.mlp.down_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 3685621760, + "file_name": ".cache\\MatMulNBits_2_0_569.const", + "file_size": 1835008 + }, + "model.layers.21.mlp.down_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 458752 + ], + "size_in_bytes": 458752, + "op_tensor_size": 458752, + "offset": 3687456768, + "file_name": ".cache\\MatMulNBits_2_0_570.const", + "file_size": 458752 + }, + "model.layers.22.input_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 3687915520, + "file_name": ".cache\\MatMulNBits_2_0_571.const", + "file_size": 8192 + }, + "model.layers.22.attn.qk_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 5120 + ], + "size_in_bytes": 20971520, + "op_tensor_size": 20971520, + "offset": 3687923712, + "file_name": ".cache\\MatMulNBits_2_0_572.const", + "file_size": 20971520 + }, + "model.layers.22.attn.qk_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 5120 + ], + "size_in_bytes": 20480, + "op_tensor_size": 20480, + "offset": 3708895232, + "file_name": ".cache\\MatMulNBits_2_0_573.const", + "file_size": 20480 + }, + "model.layers.22.attn.qk_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 163840 + ], + "size_in_bytes": 655360, + "op_tensor_size": 655360, + "offset": 3708915712, + "file_name": ".cache\\MatMulNBits_2_0_574.const", + "file_size": 655360 + }, + "model.layers.22.attn.qk_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 163840 + ], + "size_in_bytes": 163840, + "op_tensor_size": 163840, + "offset": 3709571072, + "file_name": ".cache\\MatMulNBits_2_0_575.const", + "file_size": 163840 + }, + "model.layers.22.attn.v_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 1024 + ], + "size_in_bytes": 4194304, + "op_tensor_size": 4194304, + "offset": 3709734912, + "file_name": ".cache\\MatMulNBits_2_0_576.const", + "file_size": 4194304 + }, + "model.layers.22.attn.v_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 1024 + ], + "size_in_bytes": 4096, + "op_tensor_size": 4096, + "offset": 3713929216, + "file_name": ".cache\\MatMulNBits_2_0_577.const", + "file_size": 4096 + }, + "model.layers.22.attn.v_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 32768 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 3713933312, + "file_name": ".cache\\MatMulNBits_2_0_578.const", + "file_size": 131072 + }, + "model.layers.22.attn.v_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 32768 + ], + "size_in_bytes": 32768, + "op_tensor_size": 32768, + "offset": 3714064384, + "file_name": ".cache\\MatMulNBits_2_0_579.const", + "file_size": 32768 + }, + "model.layers.22.attn.o_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 4096 + ], + "size_in_bytes": 16777216, + "op_tensor_size": 16777216, + "offset": 3714097152, + "file_name": ".cache\\MatMulNBits_2_0_580.const", + "file_size": 16777216 + }, + "model.layers.22.attn.o_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 3730874368, + "file_name": ".cache\\MatMulNBits_2_0_581.const", + "file_size": 16384 + }, + "model.layers.22.attn.o_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 131072 + ], + "size_in_bytes": 524288, + "op_tensor_size": 524288, + "offset": 3730890752, + "file_name": ".cache\\MatMulNBits_2_0_582.const", + "file_size": 524288 + }, + "model.layers.22.attn.o_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 131072 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 3731415040, + "file_name": ".cache\\MatMulNBits_2_0_583.const", + "file_size": 131072 + }, + "model.layers.22.post_attention_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 3731546112, + "file_name": ".cache\\MatMulNBits_2_0_584.const", + "file_size": 8192 + }, + "model.layers.22.mlp.gate_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 3731554304, + "file_name": ".cache\\MatMulNBits_2_0_585.const", + "file_size": 29360128 + }, + "model.layers.22.mlp.gate_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 3760914432, + "file_name": ".cache\\MatMulNBits_2_0_586.const", + "file_size": 1835008 + }, + "model.layers.22.mlp.gate_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 3762749440, + "file_name": ".cache\\MatMulNBits_2_0_587.const", + "file_size": 229376 + }, + "model.layers.22.mlp.gate_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 3762978816, + "file_name": ".cache\\MatMulNBits_2_0_588.const", + "file_size": 57344 + }, + "model.layers.22.mlp.up_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 3763036160, + "file_name": ".cache\\MatMulNBits_2_0_589.const", + "file_size": 29360128 + }, + "model.layers.22.mlp.up_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 3792396288, + "file_name": ".cache\\MatMulNBits_2_0_590.const", + "file_size": 1835008 + }, + "model.layers.22.mlp.up_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 3794231296, + "file_name": ".cache\\MatMulNBits_2_0_591.const", + "file_size": 229376 + }, + "model.layers.22.mlp.up_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 3794460672, + "file_name": ".cache\\MatMulNBits_2_0_592.const", + "file_size": 57344 + }, + "model.layers.22.mlp.down_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 14336, + 4096 + ], + "size_in_bytes": 58720256, + "op_tensor_size": 58720256, + "offset": 3794518016, + "file_name": ".cache\\MatMulNBits_2_0_593.const", + "file_size": 58720256 + }, + "model.layers.22.mlp.down_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 3853238272, + "file_name": ".cache\\MatMulNBits_2_0_594.const", + "file_size": 16384 + }, + "model.layers.22.mlp.down_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 3853254656, + "file_name": ".cache\\MatMulNBits_2_0_595.const", + "file_size": 1835008 + }, + "model.layers.22.mlp.down_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 458752 + ], + "size_in_bytes": 458752, + "op_tensor_size": 458752, + "offset": 3855089664, + "file_name": ".cache\\MatMulNBits_2_0_596.const", + "file_size": 458752 + }, + "model.layers.23.input_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 3855548416, + "file_name": ".cache\\MatMulNBits_2_0_597.const", + "file_size": 8192 + }, + "model.layers.23.attn.qk_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 5120 + ], + "size_in_bytes": 20971520, + "op_tensor_size": 20971520, + "offset": 3855556608, + "file_name": ".cache\\MatMulNBits_2_0_598.const", + "file_size": 20971520 + }, + "model.layers.23.attn.qk_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 5120 + ], + "size_in_bytes": 20480, + "op_tensor_size": 20480, + "offset": 3876528128, + "file_name": ".cache\\MatMulNBits_2_0_599.const", + "file_size": 20480 + }, + "model.layers.23.attn.qk_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 163840 + ], + "size_in_bytes": 655360, + "op_tensor_size": 655360, + "offset": 3876548608, + "file_name": ".cache\\MatMulNBits_2_0_600.const", + "file_size": 655360 + }, + "model.layers.23.attn.qk_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 163840 + ], + "size_in_bytes": 163840, + "op_tensor_size": 163840, + "offset": 3877203968, + "file_name": ".cache\\MatMulNBits_2_0_601.const", + "file_size": 163840 + }, + "model.layers.23.attn.v_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 1024 + ], + "size_in_bytes": 4194304, + "op_tensor_size": 4194304, + "offset": 3877367808, + "file_name": ".cache\\MatMulNBits_2_0_602.const", + "file_size": 4194304 + }, + "model.layers.23.attn.v_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 1024 + ], + "size_in_bytes": 4096, + "op_tensor_size": 4096, + "offset": 3881562112, + "file_name": ".cache\\MatMulNBits_2_0_603.const", + "file_size": 4096 + }, + "model.layers.23.attn.v_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 32768 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 3881566208, + "file_name": ".cache\\MatMulNBits_2_0_604.const", + "file_size": 131072 + }, + "model.layers.23.attn.v_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 32768 + ], + "size_in_bytes": 32768, + "op_tensor_size": 32768, + "offset": 3881697280, + "file_name": ".cache\\MatMulNBits_2_0_605.const", + "file_size": 32768 + }, + "model.layers.23.attn.o_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 4096 + ], + "size_in_bytes": 16777216, + "op_tensor_size": 16777216, + "offset": 3881730048, + "file_name": ".cache\\MatMulNBits_2_0_606.const", + "file_size": 16777216 + }, + "model.layers.23.attn.o_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 3898507264, + "file_name": ".cache\\MatMulNBits_2_0_607.const", + "file_size": 16384 + }, + "model.layers.23.attn.o_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 131072 + ], + "size_in_bytes": 524288, + "op_tensor_size": 524288, + "offset": 3898523648, + "file_name": ".cache\\MatMulNBits_2_0_608.const", + "file_size": 524288 + }, + "model.layers.23.attn.o_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 131072 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 3899047936, + "file_name": ".cache\\MatMulNBits_2_0_609.const", + "file_size": 131072 + }, + "model.layers.23.post_attention_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 3899179008, + "file_name": ".cache\\MatMulNBits_2_0_610.const", + "file_size": 8192 + }, + "model.layers.23.mlp.gate_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 3899187200, + "file_name": ".cache\\MatMulNBits_2_0_611.const", + "file_size": 29360128 + }, + "model.layers.23.mlp.gate_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 3928547328, + "file_name": ".cache\\MatMulNBits_2_0_612.const", + "file_size": 1835008 + }, + "model.layers.23.mlp.gate_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 3930382336, + "file_name": ".cache\\MatMulNBits_2_0_613.const", + "file_size": 229376 + }, + "model.layers.23.mlp.gate_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 3930611712, + "file_name": ".cache\\MatMulNBits_2_0_614.const", + "file_size": 57344 + }, + "model.layers.23.mlp.up_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 3930669056, + "file_name": ".cache\\MatMulNBits_2_0_615.const", + "file_size": 29360128 + }, + "model.layers.23.mlp.up_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 3960029184, + "file_name": ".cache\\MatMulNBits_2_0_616.const", + "file_size": 1835008 + }, + "model.layers.23.mlp.up_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 3961864192, + "file_name": ".cache\\MatMulNBits_2_0_617.const", + "file_size": 229376 + }, + "model.layers.23.mlp.up_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 3962093568, + "file_name": ".cache\\MatMulNBits_2_0_618.const", + "file_size": 57344 + }, + "model.layers.23.mlp.down_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 14336, + 4096 + ], + "size_in_bytes": 58720256, + "op_tensor_size": 58720256, + "offset": 3962150912, + "file_name": ".cache\\MatMulNBits_2_0_619.const", + "file_size": 58720256 + }, + "model.layers.23.mlp.down_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 4020871168, + "file_name": ".cache\\MatMulNBits_2_0_620.const", + "file_size": 16384 + }, + "model.layers.23.mlp.down_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 4020887552, + "file_name": ".cache\\MatMulNBits_2_0_621.const", + "file_size": 1835008 + }, + "model.layers.23.mlp.down_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 458752 + ], + "size_in_bytes": 458752, + "op_tensor_size": 458752, + "offset": 4022722560, + "file_name": ".cache\\MatMulNBits_2_0_622.const", + "file_size": 458752 + }, + "model.layers.24.input_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 4023181312, + "file_name": ".cache\\MatMulNBits_2_0_623.const", + "file_size": 8192 + }, + "model.layers.24.attn.qk_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 5120 + ], + "size_in_bytes": 20971520, + "op_tensor_size": 20971520, + "offset": 4023189504, + "file_name": ".cache\\MatMulNBits_2_0_624.const", + "file_size": 20971520 + }, + "model.layers.24.attn.qk_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 5120 + ], + "size_in_bytes": 20480, + "op_tensor_size": 20480, + "offset": 4044161024, + "file_name": ".cache\\MatMulNBits_2_0_625.const", + "file_size": 20480 + }, + "model.layers.24.attn.qk_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 163840 + ], + "size_in_bytes": 655360, + "op_tensor_size": 655360, + "offset": 4044181504, + "file_name": ".cache\\MatMulNBits_2_0_626.const", + "file_size": 655360 + }, + "model.layers.24.attn.qk_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 163840 + ], + "size_in_bytes": 163840, + "op_tensor_size": 163840, + "offset": 4044836864, + "file_name": ".cache\\MatMulNBits_2_0_627.const", + "file_size": 163840 + }, + "model.layers.24.attn.v_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 1024 + ], + "size_in_bytes": 4194304, + "op_tensor_size": 4194304, + "offset": 4045000704, + "file_name": ".cache\\MatMulNBits_2_0_628.const", + "file_size": 4194304 + }, + "model.layers.24.attn.v_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 1024 + ], + "size_in_bytes": 4096, + "op_tensor_size": 4096, + "offset": 4049195008, + "file_name": ".cache\\MatMulNBits_2_0_629.const", + "file_size": 4096 + }, + "model.layers.24.attn.v_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 32768 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 4049199104, + "file_name": ".cache\\MatMulNBits_2_0_630.const", + "file_size": 131072 + }, + "model.layers.24.attn.v_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 32768 + ], + "size_in_bytes": 32768, + "op_tensor_size": 32768, + "offset": 4049330176, + "file_name": ".cache\\MatMulNBits_2_0_631.const", + "file_size": 32768 + }, + "model.layers.24.attn.o_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 4096 + ], + "size_in_bytes": 16777216, + "op_tensor_size": 16777216, + "offset": 4049362944, + "file_name": ".cache\\MatMulNBits_2_0_632.const", + "file_size": 16777216 + }, + "model.layers.24.attn.o_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 4066140160, + "file_name": ".cache\\MatMulNBits_2_0_633.const", + "file_size": 16384 + }, + "model.layers.24.attn.o_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 131072 + ], + "size_in_bytes": 524288, + "op_tensor_size": 524288, + "offset": 4066156544, + "file_name": ".cache\\MatMulNBits_2_0_634.const", + "file_size": 524288 + }, + "model.layers.24.attn.o_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 131072 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 4066680832, + "file_name": ".cache\\MatMulNBits_2_0_635.const", + "file_size": 131072 + }, + "model.layers.24.post_attention_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 4066811904, + "file_name": ".cache\\MatMulNBits_2_0_636.const", + "file_size": 8192 + }, + "model.layers.24.mlp.gate_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 4066820096, + "file_name": ".cache\\MatMulNBits_2_0_637.const", + "file_size": 29360128 + }, + "model.layers.24.mlp.gate_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 4096180224, + "file_name": ".cache\\MatMulNBits_2_0_638.const", + "file_size": 1835008 + }, + "model.layers.24.mlp.gate_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 4098015232, + "file_name": ".cache\\MatMulNBits_2_0_639.const", + "file_size": 229376 + }, + "model.layers.24.mlp.gate_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 4098244608, + "file_name": ".cache\\MatMulNBits_2_0_640.const", + "file_size": 57344 + }, + "model.layers.24.mlp.up_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 4098301952, + "file_name": ".cache\\MatMulNBits_2_0_641.const", + "file_size": 29360128 + }, + "model.layers.24.mlp.up_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 4127662080, + "file_name": ".cache\\MatMulNBits_2_0_642.const", + "file_size": 1835008 + }, + "model.layers.24.mlp.up_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 4129497088, + "file_name": ".cache\\MatMulNBits_2_0_643.const", + "file_size": 229376 + }, + "model.layers.24.mlp.up_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 4129726464, + "file_name": ".cache\\MatMulNBits_2_0_644.const", + "file_size": 57344 + }, + "model.layers.24.mlp.down_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 14336, + 4096 + ], + "size_in_bytes": 58720256, + "op_tensor_size": 58720256, + "offset": 4129783808, + "file_name": ".cache\\MatMulNBits_2_0_645.const", + "file_size": 58720256 + }, + "model.layers.24.mlp.down_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 4188504064, + "file_name": ".cache\\MatMulNBits_2_0_646.const", + "file_size": 16384 + }, + "model.layers.24.mlp.down_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 4188520448, + "file_name": ".cache\\MatMulNBits_2_0_647.const", + "file_size": 1835008 + }, + "model.layers.24.mlp.down_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 458752 + ], + "size_in_bytes": 458752, + "op_tensor_size": 458752, + "offset": 4190355456, + "file_name": ".cache\\MatMulNBits_2_0_648.const", + "file_size": 458752 + }, + "model.layers.25.input_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 4190814208, + "file_name": ".cache\\MatMulNBits_2_0_649.const", + "file_size": 8192 + }, + "model.layers.25.attn.qk_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 5120 + ], + "size_in_bytes": 20971520, + "op_tensor_size": 20971520, + "offset": 4190822400, + "file_name": ".cache\\MatMulNBits_2_0_650.const", + "file_size": 20971520 + }, + "model.layers.25.attn.qk_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 5120 + ], + "size_in_bytes": 20480, + "op_tensor_size": 20480, + "offset": 4211793920, + "file_name": ".cache\\MatMulNBits_2_0_651.const", + "file_size": 20480 + }, + "model.layers.25.attn.qk_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 163840 + ], + "size_in_bytes": 655360, + "op_tensor_size": 655360, + "offset": 4211814400, + "file_name": ".cache\\MatMulNBits_2_0_652.const", + "file_size": 655360 + }, + "model.layers.25.attn.qk_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 163840 + ], + "size_in_bytes": 163840, + "op_tensor_size": 163840, + "offset": 4212469760, + "file_name": ".cache\\MatMulNBits_2_0_653.const", + "file_size": 163840 + }, + "model.layers.25.attn.v_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 1024 + ], + "size_in_bytes": 4194304, + "op_tensor_size": 4194304, + "offset": 4212633600, + "file_name": ".cache\\MatMulNBits_2_0_654.const", + "file_size": 4194304 + }, + "model.layers.25.attn.v_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 1024 + ], + "size_in_bytes": 4096, + "op_tensor_size": 4096, + "offset": 4216827904, + "file_name": ".cache\\MatMulNBits_2_0_655.const", + "file_size": 4096 + }, + "model.layers.25.attn.v_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 32768 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 4216832000, + "file_name": ".cache\\MatMulNBits_2_0_656.const", + "file_size": 131072 + }, + "model.layers.25.attn.v_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 32768 + ], + "size_in_bytes": 32768, + "op_tensor_size": 32768, + "offset": 4216963072, + "file_name": ".cache\\MatMulNBits_2_0_657.const", + "file_size": 32768 + }, + "model.layers.25.attn.o_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 4096 + ], + "size_in_bytes": 16777216, + "op_tensor_size": 16777216, + "offset": 4216995840, + "file_name": ".cache\\MatMulNBits_2_0_658.const", + "file_size": 16777216 + }, + "model.layers.25.attn.o_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 4233773056, + "file_name": ".cache\\MatMulNBits_2_0_659.const", + "file_size": 16384 + }, + "model.layers.25.attn.o_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 131072 + ], + "size_in_bytes": 524288, + "op_tensor_size": 524288, + "offset": 4233789440, + "file_name": ".cache\\MatMulNBits_2_0_660.const", + "file_size": 524288 + }, + "model.layers.25.attn.o_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 131072 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 4234313728, + "file_name": ".cache\\MatMulNBits_2_0_661.const", + "file_size": 131072 + }, + "model.layers.25.post_attention_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 4234444800, + "file_name": ".cache\\MatMulNBits_2_0_662.const", + "file_size": 8192 + }, + "model.layers.25.mlp.gate_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 4234452992, + "file_name": ".cache\\MatMulNBits_2_0_663.const", + "file_size": 29360128 + }, + "model.layers.25.mlp.gate_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 4263813120, + "file_name": ".cache\\MatMulNBits_2_0_664.const", + "file_size": 1835008 + }, + "model.layers.25.mlp.gate_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 4265648128, + "file_name": ".cache\\MatMulNBits_2_0_665.const", + "file_size": 229376 + }, + "model.layers.25.mlp.gate_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 4265877504, + "file_name": ".cache\\MatMulNBits_2_0_666.const", + "file_size": 57344 + }, + "model.layers.25.mlp.up_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 4265934848, + "file_name": ".cache\\MatMulNBits_2_0_667.const", + "file_size": 29360128 + }, + "model.layers.25.mlp.up_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 4295294976, + "file_name": ".cache\\MatMulNBits_2_0_668.const", + "file_size": 1835008 + }, + "model.layers.25.mlp.up_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 4297129984, + "file_name": ".cache\\MatMulNBits_2_0_669.const", + "file_size": 229376 + }, + "model.layers.25.mlp.up_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 4297359360, + "file_name": ".cache\\MatMulNBits_2_0_670.const", + "file_size": 57344 + }, + "model.layers.25.mlp.down_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 14336, + 4096 + ], + "size_in_bytes": 58720256, + "op_tensor_size": 58720256, + "offset": 4297416704, + "file_name": ".cache\\MatMulNBits_2_0_671.const", + "file_size": 58720256 + }, + "model.layers.25.mlp.down_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 4356136960, + "file_name": ".cache\\MatMulNBits_2_0_672.const", + "file_size": 16384 + }, + "model.layers.25.mlp.down_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 4356153344, + "file_name": ".cache\\MatMulNBits_2_0_673.const", + "file_size": 1835008 + }, + "model.layers.25.mlp.down_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 458752 + ], + "size_in_bytes": 458752, + "op_tensor_size": 458752, + "offset": 4357988352, + "file_name": ".cache\\MatMulNBits_2_0_674.const", + "file_size": 458752 + }, + "model.layers.26.input_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 4358447104, + "file_name": ".cache\\MatMulNBits_2_0_675.const", + "file_size": 8192 + }, + "model.layers.26.attn.qk_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 5120 + ], + "size_in_bytes": 20971520, + "op_tensor_size": 20971520, + "offset": 4358455296, + "file_name": ".cache\\MatMulNBits_2_0_676.const", + "file_size": 20971520 + }, + "model.layers.26.attn.qk_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 5120 + ], + "size_in_bytes": 20480, + "op_tensor_size": 20480, + "offset": 4379426816, + "file_name": ".cache\\MatMulNBits_2_0_677.const", + "file_size": 20480 + }, + "model.layers.26.attn.qk_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 163840 + ], + "size_in_bytes": 655360, + "op_tensor_size": 655360, + "offset": 4379447296, + "file_name": ".cache\\MatMulNBits_2_0_678.const", + "file_size": 655360 + }, + "model.layers.26.attn.qk_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 163840 + ], + "size_in_bytes": 163840, + "op_tensor_size": 163840, + "offset": 4380102656, + "file_name": ".cache\\MatMulNBits_2_0_679.const", + "file_size": 163840 + }, + "model.layers.26.attn.v_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 1024 + ], + "size_in_bytes": 4194304, + "op_tensor_size": 4194304, + "offset": 4380266496, + "file_name": ".cache\\MatMulNBits_2_0_680.const", + "file_size": 4194304 + }, + "model.layers.26.attn.v_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 1024 + ], + "size_in_bytes": 4096, + "op_tensor_size": 4096, + "offset": 4384460800, + "file_name": ".cache\\MatMulNBits_2_0_681.const", + "file_size": 4096 + }, + "model.layers.26.attn.v_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 32768 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 4384464896, + "file_name": ".cache\\MatMulNBits_2_0_682.const", + "file_size": 131072 + }, + "model.layers.26.attn.v_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 32768 + ], + "size_in_bytes": 32768, + "op_tensor_size": 32768, + "offset": 4384595968, + "file_name": ".cache\\MatMulNBits_2_0_683.const", + "file_size": 32768 + }, + "model.layers.26.attn.o_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 4096 + ], + "size_in_bytes": 16777216, + "op_tensor_size": 16777216, + "offset": 4384628736, + "file_name": ".cache\\MatMulNBits_2_0_684.const", + "file_size": 16777216 + }, + "model.layers.26.attn.o_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 4401405952, + "file_name": ".cache\\MatMulNBits_2_0_685.const", + "file_size": 16384 + }, + "model.layers.26.attn.o_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 131072 + ], + "size_in_bytes": 524288, + "op_tensor_size": 524288, + "offset": 4401422336, + "file_name": ".cache\\MatMulNBits_2_0_686.const", + "file_size": 524288 + }, + "model.layers.26.attn.o_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 131072 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 4401946624, + "file_name": ".cache\\MatMulNBits_2_0_687.const", + "file_size": 131072 + }, + "model.layers.26.post_attention_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 4402077696, + "file_name": ".cache\\MatMulNBits_2_0_688.const", + "file_size": 8192 + }, + "model.layers.26.mlp.gate_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 4402085888, + "file_name": ".cache\\MatMulNBits_2_0_689.const", + "file_size": 29360128 + }, + "model.layers.26.mlp.gate_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 4431446016, + "file_name": ".cache\\MatMulNBits_2_0_690.const", + "file_size": 1835008 + }, + "model.layers.26.mlp.gate_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 4433281024, + "file_name": ".cache\\MatMulNBits_2_0_691.const", + "file_size": 229376 + }, + "model.layers.26.mlp.gate_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 4433510400, + "file_name": ".cache\\MatMulNBits_2_0_692.const", + "file_size": 57344 + }, + "model.layers.26.mlp.up_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 4433567744, + "file_name": ".cache\\MatMulNBits_2_0_693.const", + "file_size": 29360128 + }, + "model.layers.26.mlp.up_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 4462927872, + "file_name": ".cache\\MatMulNBits_2_0_694.const", + "file_size": 1835008 + }, + "model.layers.26.mlp.up_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 4464762880, + "file_name": ".cache\\MatMulNBits_2_0_695.const", + "file_size": 229376 + }, + "model.layers.26.mlp.up_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 4464992256, + "file_name": ".cache\\MatMulNBits_2_0_696.const", + "file_size": 57344 + }, + "model.layers.26.mlp.down_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 14336, + 4096 + ], + "size_in_bytes": 58720256, + "op_tensor_size": 58720256, + "offset": 4465049600, + "file_name": ".cache\\MatMulNBits_2_0_697.const", + "file_size": 58720256 + }, + "model.layers.26.mlp.down_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 4523769856, + "file_name": ".cache\\MatMulNBits_2_0_698.const", + "file_size": 16384 + }, + "model.layers.26.mlp.down_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 4523786240, + "file_name": ".cache\\MatMulNBits_2_0_699.const", + "file_size": 1835008 + }, + "model.layers.26.mlp.down_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 458752 + ], + "size_in_bytes": 458752, + "op_tensor_size": 458752, + "offset": 4525621248, + "file_name": ".cache\\MatMulNBits_2_0_700.const", + "file_size": 458752 + }, + "model.layers.27.input_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 4526080000, + "file_name": ".cache\\MatMulNBits_2_0_701.const", + "file_size": 8192 + }, + "model.layers.27.attn.qk_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 5120 + ], + "size_in_bytes": 20971520, + "op_tensor_size": 20971520, + "offset": 4526088192, + "file_name": ".cache\\MatMulNBits_2_0_702.const", + "file_size": 20971520 + }, + "model.layers.27.attn.qk_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 5120 + ], + "size_in_bytes": 20480, + "op_tensor_size": 20480, + "offset": 4547059712, + "file_name": ".cache\\MatMulNBits_2_0_703.const", + "file_size": 20480 + }, + "model.layers.27.attn.qk_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 163840 + ], + "size_in_bytes": 655360, + "op_tensor_size": 655360, + "offset": 4547080192, + "file_name": ".cache\\MatMulNBits_2_0_704.const", + "file_size": 655360 + }, + "model.layers.27.attn.qk_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 163840 + ], + "size_in_bytes": 163840, + "op_tensor_size": 163840, + "offset": 4547735552, + "file_name": ".cache\\MatMulNBits_2_0_705.const", + "file_size": 163840 + }, + "model.layers.27.attn.v_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 1024 + ], + "size_in_bytes": 4194304, + "op_tensor_size": 4194304, + "offset": 4547899392, + "file_name": ".cache\\MatMulNBits_2_0_706.const", + "file_size": 4194304 + }, + "model.layers.27.attn.v_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 1024 + ], + "size_in_bytes": 4096, + "op_tensor_size": 4096, + "offset": 4552093696, + "file_name": ".cache\\MatMulNBits_2_0_707.const", + "file_size": 4096 + }, + "model.layers.27.attn.v_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 32768 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 4552097792, + "file_name": ".cache\\MatMulNBits_2_0_708.const", + "file_size": 131072 + }, + "model.layers.27.attn.v_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 32768 + ], + "size_in_bytes": 32768, + "op_tensor_size": 32768, + "offset": 4552228864, + "file_name": ".cache\\MatMulNBits_2_0_709.const", + "file_size": 32768 + }, + "model.layers.27.attn.o_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 4096 + ], + "size_in_bytes": 16777216, + "op_tensor_size": 16777216, + "offset": 4552261632, + "file_name": ".cache\\MatMulNBits_2_0_710.const", + "file_size": 16777216 + }, + "model.layers.27.attn.o_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 4569038848, + "file_name": ".cache\\MatMulNBits_2_0_711.const", + "file_size": 16384 + }, + "model.layers.27.attn.o_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 131072 + ], + "size_in_bytes": 524288, + "op_tensor_size": 524288, + "offset": 4569055232, + "file_name": ".cache\\MatMulNBits_2_0_712.const", + "file_size": 524288 + }, + "model.layers.27.attn.o_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 131072 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 4569579520, + "file_name": ".cache\\MatMulNBits_2_0_713.const", + "file_size": 131072 + }, + "model.layers.27.post_attention_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 4569710592, + "file_name": ".cache\\MatMulNBits_2_0_714.const", + "file_size": 8192 + }, + "model.layers.27.mlp.gate_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 4569718784, + "file_name": ".cache\\MatMulNBits_2_0_715.const", + "file_size": 29360128 + }, + "model.layers.27.mlp.gate_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 4599078912, + "file_name": ".cache\\MatMulNBits_2_0_716.const", + "file_size": 1835008 + }, + "model.layers.27.mlp.gate_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 4600913920, + "file_name": ".cache\\MatMulNBits_2_0_717.const", + "file_size": 229376 + }, + "model.layers.27.mlp.gate_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 4601143296, + "file_name": ".cache\\MatMulNBits_2_0_718.const", + "file_size": 57344 + }, + "model.layers.27.mlp.up_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 4601200640, + "file_name": ".cache\\MatMulNBits_2_0_719.const", + "file_size": 29360128 + }, + "model.layers.27.mlp.up_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 4630560768, + "file_name": ".cache\\MatMulNBits_2_0_720.const", + "file_size": 1835008 + }, + "model.layers.27.mlp.up_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 4632395776, + "file_name": ".cache\\MatMulNBits_2_0_721.const", + "file_size": 229376 + }, + "model.layers.27.mlp.up_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 4632625152, + "file_name": ".cache\\MatMulNBits_2_0_722.const", + "file_size": 57344 + }, + "model.layers.27.mlp.down_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 14336, + 4096 + ], + "size_in_bytes": 58720256, + "op_tensor_size": 58720256, + "offset": 4632682496, + "file_name": ".cache\\MatMulNBits_2_0_723.const", + "file_size": 58720256 + }, + "model.layers.27.mlp.down_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 4691402752, + "file_name": ".cache\\MatMulNBits_2_0_724.const", + "file_size": 16384 + }, + "model.layers.27.mlp.down_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 4691419136, + "file_name": ".cache\\MatMulNBits_2_0_725.const", + "file_size": 1835008 + }, + "model.layers.27.mlp.down_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 458752 + ], + "size_in_bytes": 458752, + "op_tensor_size": 458752, + "offset": 4693254144, + "file_name": ".cache\\MatMulNBits_2_0_726.const", + "file_size": 458752 + }, + "model.layers.28.input_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 4693712896, + "file_name": ".cache\\MatMulNBits_2_0_727.const", + "file_size": 8192 + }, + "model.layers.28.attn.qk_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 5120 + ], + "size_in_bytes": 20971520, + "op_tensor_size": 20971520, + "offset": 4693721088, + "file_name": ".cache\\MatMulNBits_2_0_728.const", + "file_size": 20971520 + }, + "model.layers.28.attn.qk_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 5120 + ], + "size_in_bytes": 20480, + "op_tensor_size": 20480, + "offset": 4714692608, + "file_name": ".cache\\MatMulNBits_2_0_729.const", + "file_size": 20480 + }, + "model.layers.28.attn.qk_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 163840 + ], + "size_in_bytes": 655360, + "op_tensor_size": 655360, + "offset": 4714713088, + "file_name": ".cache\\MatMulNBits_2_0_730.const", + "file_size": 655360 + }, + "model.layers.28.attn.qk_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 163840 + ], + "size_in_bytes": 163840, + "op_tensor_size": 163840, + "offset": 4715368448, + "file_name": ".cache\\MatMulNBits_2_0_731.const", + "file_size": 163840 + }, + "model.layers.28.attn.v_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 1024 + ], + "size_in_bytes": 4194304, + "op_tensor_size": 4194304, + "offset": 4715532288, + "file_name": ".cache\\MatMulNBits_2_0_732.const", + "file_size": 4194304 + }, + "model.layers.28.attn.v_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 1024 + ], + "size_in_bytes": 4096, + "op_tensor_size": 4096, + "offset": 4719726592, + "file_name": ".cache\\MatMulNBits_2_0_733.const", + "file_size": 4096 + }, + "model.layers.28.attn.v_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 32768 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 4719730688, + "file_name": ".cache\\MatMulNBits_2_0_734.const", + "file_size": 131072 + }, + "model.layers.28.attn.v_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 32768 + ], + "size_in_bytes": 32768, + "op_tensor_size": 32768, + "offset": 4719861760, + "file_name": ".cache\\MatMulNBits_2_0_735.const", + "file_size": 32768 + }, + "model.layers.28.attn.o_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 4096 + ], + "size_in_bytes": 16777216, + "op_tensor_size": 16777216, + "offset": 4719894528, + "file_name": ".cache\\MatMulNBits_2_0_736.const", + "file_size": 16777216 + }, + "model.layers.28.attn.o_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 4736671744, + "file_name": ".cache\\MatMulNBits_2_0_737.const", + "file_size": 16384 + }, + "model.layers.28.attn.o_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 131072 + ], + "size_in_bytes": 524288, + "op_tensor_size": 524288, + "offset": 4736688128, + "file_name": ".cache\\MatMulNBits_2_0_738.const", + "file_size": 524288 + }, + "model.layers.28.attn.o_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 131072 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 4737212416, + "file_name": ".cache\\MatMulNBits_2_0_739.const", + "file_size": 131072 + }, + "model.layers.28.post_attention_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 4737343488, + "file_name": ".cache\\MatMulNBits_2_0_740.const", + "file_size": 8192 + }, + "model.layers.28.mlp.gate_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 4737351680, + "file_name": ".cache\\MatMulNBits_2_0_741.const", + "file_size": 29360128 + }, + "model.layers.28.mlp.gate_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 4766711808, + "file_name": ".cache\\MatMulNBits_2_0_742.const", + "file_size": 1835008 + }, + "model.layers.28.mlp.gate_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 4768546816, + "file_name": ".cache\\MatMulNBits_2_0_743.const", + "file_size": 229376 + }, + "model.layers.28.mlp.gate_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 4768776192, + "file_name": ".cache\\MatMulNBits_2_0_744.const", + "file_size": 57344 + }, + "model.layers.28.mlp.up_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 4768833536, + "file_name": ".cache\\MatMulNBits_2_0_745.const", + "file_size": 29360128 + }, + "model.layers.28.mlp.up_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 4798193664, + "file_name": ".cache\\MatMulNBits_2_0_746.const", + "file_size": 1835008 + }, + "model.layers.28.mlp.up_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 4800028672, + "file_name": ".cache\\MatMulNBits_2_0_747.const", + "file_size": 229376 + }, + "model.layers.28.mlp.up_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 4800258048, + "file_name": ".cache\\MatMulNBits_2_0_748.const", + "file_size": 57344 + }, + "model.layers.28.mlp.down_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 14336, + 4096 + ], + "size_in_bytes": 58720256, + "op_tensor_size": 58720256, + "offset": 4800315392, + "file_name": ".cache\\MatMulNBits_2_0_749.const", + "file_size": 58720256 + }, + "model.layers.28.mlp.down_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 4859035648, + "file_name": ".cache\\MatMulNBits_2_0_750.const", + "file_size": 16384 + }, + "model.layers.28.mlp.down_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 4859052032, + "file_name": ".cache\\MatMulNBits_2_0_751.const", + "file_size": 1835008 + }, + "model.layers.28.mlp.down_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 458752 + ], + "size_in_bytes": 458752, + "op_tensor_size": 458752, + "offset": 4860887040, + "file_name": ".cache\\MatMulNBits_2_0_752.const", + "file_size": 458752 + }, + "model.layers.29.input_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 4861345792, + "file_name": ".cache\\MatMulNBits_2_0_753.const", + "file_size": 8192 + }, + "model.layers.29.attn.qk_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 5120 + ], + "size_in_bytes": 20971520, + "op_tensor_size": 20971520, + "offset": 4861353984, + "file_name": ".cache\\MatMulNBits_2_0_754.const", + "file_size": 20971520 + }, + "model.layers.29.attn.qk_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 5120 + ], + "size_in_bytes": 20480, + "op_tensor_size": 20480, + "offset": 4882325504, + "file_name": ".cache\\MatMulNBits_2_0_755.const", + "file_size": 20480 + }, + "model.layers.29.attn.qk_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 163840 + ], + "size_in_bytes": 655360, + "op_tensor_size": 655360, + "offset": 4882345984, + "file_name": ".cache\\MatMulNBits_2_0_756.const", + "file_size": 655360 + }, + "model.layers.29.attn.qk_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 163840 + ], + "size_in_bytes": 163840, + "op_tensor_size": 163840, + "offset": 4883001344, + "file_name": ".cache\\MatMulNBits_2_0_757.const", + "file_size": 163840 + }, + "model.layers.29.attn.v_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 1024 + ], + "size_in_bytes": 4194304, + "op_tensor_size": 4194304, + "offset": 4883165184, + "file_name": ".cache\\MatMulNBits_2_0_758.const", + "file_size": 4194304 + }, + "model.layers.29.attn.v_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 1024 + ], + "size_in_bytes": 4096, + "op_tensor_size": 4096, + "offset": 4887359488, + "file_name": ".cache\\MatMulNBits_2_0_759.const", + "file_size": 4096 + }, + "model.layers.29.attn.v_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 32768 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 4887363584, + "file_name": ".cache\\MatMulNBits_2_0_760.const", + "file_size": 131072 + }, + "model.layers.29.attn.v_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 32768 + ], + "size_in_bytes": 32768, + "op_tensor_size": 32768, + "offset": 4887494656, + "file_name": ".cache\\MatMulNBits_2_0_761.const", + "file_size": 32768 + }, + "model.layers.29.attn.o_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 4096 + ], + "size_in_bytes": 16777216, + "op_tensor_size": 16777216, + "offset": 4887527424, + "file_name": ".cache\\MatMulNBits_2_0_762.const", + "file_size": 16777216 + }, + "model.layers.29.attn.o_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 4904304640, + "file_name": ".cache\\MatMulNBits_2_0_763.const", + "file_size": 16384 + }, + "model.layers.29.attn.o_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 131072 + ], + "size_in_bytes": 524288, + "op_tensor_size": 524288, + "offset": 4904321024, + "file_name": ".cache\\MatMulNBits_2_0_764.const", + "file_size": 524288 + }, + "model.layers.29.attn.o_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 131072 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 4904845312, + "file_name": ".cache\\MatMulNBits_2_0_765.const", + "file_size": 131072 + }, + "model.layers.29.post_attention_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 4904976384, + "file_name": ".cache\\MatMulNBits_2_0_766.const", + "file_size": 8192 + }, + "model.layers.29.mlp.gate_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 4904984576, + "file_name": ".cache\\MatMulNBits_2_0_767.const", + "file_size": 29360128 + }, + "model.layers.29.mlp.gate_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 4934344704, + "file_name": ".cache\\MatMulNBits_2_0_768.const", + "file_size": 1835008 + }, + "model.layers.29.mlp.gate_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 4936179712, + "file_name": ".cache\\MatMulNBits_2_0_769.const", + "file_size": 229376 + }, + "model.layers.29.mlp.gate_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 4936409088, + "file_name": ".cache\\MatMulNBits_2_0_770.const", + "file_size": 57344 + }, + "model.layers.29.mlp.up_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 4936466432, + "file_name": ".cache\\MatMulNBits_2_0_771.const", + "file_size": 29360128 + }, + "model.layers.29.mlp.up_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 4965826560, + "file_name": ".cache\\MatMulNBits_2_0_772.const", + "file_size": 1835008 + }, + "model.layers.29.mlp.up_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 4967661568, + "file_name": ".cache\\MatMulNBits_2_0_773.const", + "file_size": 229376 + }, + "model.layers.29.mlp.up_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 4967890944, + "file_name": ".cache\\MatMulNBits_2_0_774.const", + "file_size": 57344 + }, + "model.layers.29.mlp.down_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 14336, + 4096 + ], + "size_in_bytes": 58720256, + "op_tensor_size": 58720256, + "offset": 4967948288, + "file_name": ".cache\\MatMulNBits_2_0_775.const", + "file_size": 58720256 + }, + "model.layers.29.mlp.down_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 5026668544, + "file_name": ".cache\\MatMulNBits_2_0_776.const", + "file_size": 16384 + }, + "model.layers.29.mlp.down_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 5026684928, + "file_name": ".cache\\MatMulNBits_2_0_777.const", + "file_size": 1835008 + }, + "model.layers.29.mlp.down_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 458752 + ], + "size_in_bytes": 458752, + "op_tensor_size": 458752, + "offset": 5028519936, + "file_name": ".cache\\MatMulNBits_2_0_778.const", + "file_size": 458752 + }, + "model.layers.30.input_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 5028978688, + "file_name": ".cache\\MatMulNBits_2_0_779.const", + "file_size": 8192 + }, + "model.layers.30.attn.qk_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 5120 + ], + "size_in_bytes": 20971520, + "op_tensor_size": 20971520, + "offset": 5028986880, + "file_name": ".cache\\MatMulNBits_2_0_780.const", + "file_size": 20971520 + }, + "model.layers.30.attn.qk_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 5120 + ], + "size_in_bytes": 20480, + "op_tensor_size": 20480, + "offset": 5049958400, + "file_name": ".cache\\MatMulNBits_2_0_781.const", + "file_size": 20480 + }, + "model.layers.30.attn.qk_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 163840 + ], + "size_in_bytes": 655360, + "op_tensor_size": 655360, + "offset": 5049978880, + "file_name": ".cache\\MatMulNBits_2_0_782.const", + "file_size": 655360 + }, + "model.layers.30.attn.qk_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 163840 + ], + "size_in_bytes": 163840, + "op_tensor_size": 163840, + "offset": 5050634240, + "file_name": ".cache\\MatMulNBits_2_0_783.const", + "file_size": 163840 + }, + "model.layers.30.attn.v_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 1024 + ], + "size_in_bytes": 4194304, + "op_tensor_size": 4194304, + "offset": 5050798080, + "file_name": ".cache\\MatMulNBits_2_0_784.const", + "file_size": 4194304 + }, + "model.layers.30.attn.v_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 1024 + ], + "size_in_bytes": 4096, + "op_tensor_size": 4096, + "offset": 5054992384, + "file_name": ".cache\\MatMulNBits_2_0_785.const", + "file_size": 4096 + }, + "model.layers.30.attn.v_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 32768 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 5054996480, + "file_name": ".cache\\MatMulNBits_2_0_786.const", + "file_size": 131072 + }, + "model.layers.30.attn.v_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 32768 + ], + "size_in_bytes": 32768, + "op_tensor_size": 32768, + "offset": 5055127552, + "file_name": ".cache\\MatMulNBits_2_0_787.const", + "file_size": 32768 + }, + "model.layers.30.attn.o_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 4096 + ], + "size_in_bytes": 16777216, + "op_tensor_size": 16777216, + "offset": 5055160320, + "file_name": ".cache\\MatMulNBits_2_0_788.const", + "file_size": 16777216 + }, + "model.layers.30.attn.o_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 5071937536, + "file_name": ".cache\\MatMulNBits_2_0_789.const", + "file_size": 16384 + }, + "model.layers.30.attn.o_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 131072 + ], + "size_in_bytes": 524288, + "op_tensor_size": 524288, + "offset": 5071953920, + "file_name": ".cache\\MatMulNBits_2_0_790.const", + "file_size": 524288 + }, + "model.layers.30.attn.o_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 131072 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 5072478208, + "file_name": ".cache\\MatMulNBits_2_0_791.const", + "file_size": 131072 + }, + "model.layers.30.post_attention_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 5072609280, + "file_name": ".cache\\MatMulNBits_2_0_792.const", + "file_size": 8192 + }, + "model.layers.30.mlp.gate_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 5072617472, + "file_name": ".cache\\MatMulNBits_2_0_793.const", + "file_size": 29360128 + }, + "model.layers.30.mlp.gate_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 5101977600, + "file_name": ".cache\\MatMulNBits_2_0_794.const", + "file_size": 1835008 + }, + "model.layers.30.mlp.gate_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 5103812608, + "file_name": ".cache\\MatMulNBits_2_0_795.const", + "file_size": 229376 + }, + "model.layers.30.mlp.gate_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 5104041984, + "file_name": ".cache\\MatMulNBits_2_0_796.const", + "file_size": 57344 + }, + "model.layers.30.mlp.up_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 5104099328, + "file_name": ".cache\\MatMulNBits_2_0_797.const", + "file_size": 29360128 + }, + "model.layers.30.mlp.up_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 5133459456, + "file_name": ".cache\\MatMulNBits_2_0_798.const", + "file_size": 1835008 + }, + "model.layers.30.mlp.up_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 5135294464, + "file_name": ".cache\\MatMulNBits_2_0_799.const", + "file_size": 229376 + }, + "model.layers.30.mlp.up_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 5135523840, + "file_name": ".cache\\MatMulNBits_2_0_800.const", + "file_size": 57344 + }, + "model.layers.30.mlp.down_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 14336, + 4096 + ], + "size_in_bytes": 58720256, + "op_tensor_size": 58720256, + "offset": 5135581184, + "file_name": ".cache\\MatMulNBits_2_0_801.const", + "file_size": 58720256 + }, + "model.layers.30.mlp.down_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 5194301440, + "file_name": ".cache\\MatMulNBits_2_0_802.const", + "file_size": 16384 + }, + "model.layers.30.mlp.down_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 5194317824, + "file_name": ".cache\\MatMulNBits_2_0_803.const", + "file_size": 1835008 + }, + "model.layers.30.mlp.down_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 458752 + ], + "size_in_bytes": 458752, + "op_tensor_size": 458752, + "offset": 5196152832, + "file_name": ".cache\\MatMulNBits_2_0_804.const", + "file_size": 458752 + }, + "model.layers.31.input_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 5196611584, + "file_name": ".cache\\MatMulNBits_2_0_805.const", + "file_size": 8192 + }, + "model.layers.31.attn.qk_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 5120 + ], + "size_in_bytes": 20971520, + "op_tensor_size": 20971520, + "offset": 5196619776, + "file_name": ".cache\\MatMulNBits_2_0_806.const", + "file_size": 20971520 + }, + "model.layers.31.attn.qk_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 5120 + ], + "size_in_bytes": 20480, + "op_tensor_size": 20480, + "offset": 5217591296, + "file_name": ".cache\\MatMulNBits_2_0_807.const", + "file_size": 20480 + }, + "model.layers.31.attn.qk_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 163840 + ], + "size_in_bytes": 655360, + "op_tensor_size": 655360, + "offset": 5217611776, + "file_name": ".cache\\MatMulNBits_2_0_808.const", + "file_size": 655360 + }, + "model.layers.31.attn.qk_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 163840 + ], + "size_in_bytes": 163840, + "op_tensor_size": 163840, + "offset": 5218267136, + "file_name": ".cache\\MatMulNBits_2_0_809.const", + "file_size": 163840 + }, + "model.layers.31.attn.v_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 1024 + ], + "size_in_bytes": 4194304, + "op_tensor_size": 4194304, + "offset": 5218430976, + "file_name": ".cache\\MatMulNBits_2_0_810.const", + "file_size": 4194304 + }, + "model.layers.31.attn.v_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 1024 + ], + "size_in_bytes": 4096, + "op_tensor_size": 4096, + "offset": 5222625280, + "file_name": ".cache\\MatMulNBits_2_0_811.const", + "file_size": 4096 + }, + "model.layers.31.attn.v_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 32768 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 5222629376, + "file_name": ".cache\\MatMulNBits_2_0_812.const", + "file_size": 131072 + }, + "model.layers.31.attn.v_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 32768 + ], + "size_in_bytes": 32768, + "op_tensor_size": 32768, + "offset": 5222760448, + "file_name": ".cache\\MatMulNBits_2_0_813.const", + "file_size": 32768 + }, + "model.layers.31.attn.o_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 4096 + ], + "size_in_bytes": 16777216, + "op_tensor_size": 16777216, + "offset": 5222793216, + "file_name": ".cache\\MatMulNBits_2_0_814.const", + "file_size": 16777216 + }, + "model.layers.31.attn.o_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 5239570432, + "file_name": ".cache\\MatMulNBits_2_0_815.const", + "file_size": 16384 + }, + "model.layers.31.attn.o_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 131072 + ], + "size_in_bytes": 524288, + "op_tensor_size": 524288, + "offset": 5239586816, + "file_name": ".cache\\MatMulNBits_2_0_816.const", + "file_size": 524288 + }, + "model.layers.31.attn.o_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 131072 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 5240111104, + "file_name": ".cache\\MatMulNBits_2_0_817.const", + "file_size": 131072 + }, + "model.layers.31.post_attention_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 5240242176, + "file_name": ".cache\\MatMulNBits_2_0_818.const", + "file_size": 8192 + }, + "model.layers.31.mlp.gate_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 5240250368, + "file_name": ".cache\\MatMulNBits_2_0_819.const", + "file_size": 29360128 + }, + "model.layers.31.mlp.gate_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 5269610496, + "file_name": ".cache\\MatMulNBits_2_0_820.const", + "file_size": 1835008 + }, + "model.layers.31.mlp.gate_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 5271445504, + "file_name": ".cache\\MatMulNBits_2_0_821.const", + "file_size": 229376 + }, + "model.layers.31.mlp.gate_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 5271674880, + "file_name": ".cache\\MatMulNBits_2_0_822.const", + "file_size": 57344 + }, + "model.layers.31.mlp.up_proj.MatMulNBits.qweight": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 14336, + 32, + 64 + ], + "size_in_bytes": 29360128, + "op_tensor_size": 29360128, + "offset": 5271732224, + "file_name": ".cache\\MatMulNBits_2_0_823.const", + "file_size": 29360128 + }, + "model.layers.31.mlp.up_proj.MatMulNBits.scales.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 5301092352, + "file_name": ".cache\\MatMulNBits_2_0_824.const", + "file_size": 1835008 + }, + "model.layers.31.mlp.up_proj.MatMulNBits.qzeros": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "uint8", + "shape": [ + 229376 + ], + "size_in_bytes": 229376, + "op_tensor_size": 229376, + "offset": 5302927360, + "file_name": ".cache\\MatMulNBits_2_0_825.const", + "file_size": 229376 + }, + "model.layers.31.mlp.up_proj.MatMulNBits.bias.f": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 14336 + ], + "size_in_bytes": 57344, + "op_tensor_size": 57344, + "offset": 5303156736, + "file_name": ".cache\\MatMulNBits_2_0_826.const", + "file_size": 57344 + }, + "model.layers.31.mlp.down_proj.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 14336, + 4096 + ], + "size_in_bytes": 58720256, + "op_tensor_size": 58720256, + "offset": 5303214080, + "file_name": ".cache\\MatMulNBits_2_0_827.const", + "file_size": 58720256 + }, + "model.layers.31.mlp.down_proj.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 4096 + ], + "size_in_bytes": 16384, + "op_tensor_size": 16384, + "offset": 5361934336, + "file_name": ".cache\\MatMulNBits_2_0_828.const", + "file_size": 16384 + }, + "model.layers.31.mlp.down_proj.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 458752 + ], + "size_in_bytes": 1835008, + "op_tensor_size": 1835008, + "offset": 5361950720, + "file_name": ".cache\\MatMulNBits_2_0_829.const", + "file_size": 1835008 + }, + "model.layers.31.mlp.down_proj.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 458752 + ], + "size_in_bytes": 458752, + "op_tensor_size": 458752, + "offset": 5363785728, + "file_name": ".cache\\MatMulNBits_2_0_830.const", + "file_size": 458752 + }, + "model.layers.32.final_norm_layernorm.weight.bf": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "bfloat16", + "shape": [ + 4096 + ], + "size_in_bytes": 8192, + "op_tensor_size": 8192, + "offset": 5364244480, + "file_name": ".cache\\MatMulNBits_2_0_831.const", + "file_size": 8192 + }, + "lm_head.MatMulNBits.qweight.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 4096, + 32768 + ], + "size_in_bytes": 134217728, + "op_tensor_size": 134217728, + "offset": 5364252672, + "file_name": ".cache\\MatMulNBits_2_0_832.const", + "file_size": 134217728 + }, + "lm_head.MatMulNBits.bias.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 32768 + ], + "size_in_bytes": 131072, + "op_tensor_size": 131072, + "offset": 5498470400, + "file_name": ".cache\\MatMulNBits_2_0_833.const", + "file_size": 131072 + }, + "lm_head.MatMulNBits.scales.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "float", + "shape": [ + 1048576 + ], + "size_in_bytes": 4194304, + "op_tensor_size": 4194304, + "offset": 5498601472, + "file_name": ".cache\\MatMulNBits_2_0_834.const", + "file_size": 4194304 + }, + "lm_head.MatMulNBits.qzeros.preformat": { + "packed_buffer_label": "const", + "xrt_arg_id": 3, + "dtype": "int8", + "shape": [ + 1048576 + ], + "size_in_bytes": 1048576, + "op_tensor_size": 1048576, + "offset": 5502795776, + "file_name": ".cache\\MatMulNBits_2_0_835.const", + "file_size": 1048576 + }, + "past_key_values.0.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 0 + }, + "past_key_values.0.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 8388608 + }, + "present.0.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 0 + }, + "present.0.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 8388608 + }, + "past_key_values.1.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 16777216 + }, + "past_key_values.1.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 25165824 + }, + "present.1.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 16777216 + }, + "present.1.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 25165824 + }, + "past_key_values.2.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 33554432 + }, + "past_key_values.2.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 41943040 + }, + "present.2.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 33554432 + }, + "present.2.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 41943040 + }, + "past_key_values.3.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 50331648 + }, + "past_key_values.3.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 58720256 + }, + "present.3.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 50331648 + }, + "present.3.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 58720256 + }, + "past_key_values.4.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 67108864 + }, + "past_key_values.4.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 75497472 + }, + "present.4.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 67108864 + }, + "present.4.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 75497472 + }, + "past_key_values.5.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 83886080 + }, + "past_key_values.5.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 92274688 + }, + "present.5.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 83886080 + }, + "present.5.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 92274688 + }, + "past_key_values.6.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 100663296 + }, + "past_key_values.6.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 109051904 + }, + "present.6.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 100663296 + }, + "present.6.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 109051904 + }, + "past_key_values.7.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 117440512 + }, + "past_key_values.7.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 125829120 + }, + "present.7.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 117440512 + }, + "present.7.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 125829120 + }, + "past_key_values.8.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 134217728 + }, + "past_key_values.8.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 142606336 + }, + "present.8.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 134217728 + }, + "present.8.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 142606336 + }, + "past_key_values.9.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 150994944 + }, + "past_key_values.9.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 159383552 + }, + "present.9.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 150994944 + }, + "present.9.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 159383552 + }, + "past_key_values.10.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 167772160 + }, + "past_key_values.10.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 176160768 + }, + "present.10.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 167772160 + }, + "present.10.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 176160768 + }, + "past_key_values.11.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 184549376 + }, + "past_key_values.11.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 192937984 + }, + "present.11.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 184549376 + }, + "present.11.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 192937984 + }, + "past_key_values.12.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 201326592 + }, + "past_key_values.12.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 209715200 + }, + "present.12.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 201326592 + }, + "present.12.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 209715200 + }, + "past_key_values.13.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 218103808 + }, + "past_key_values.13.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 226492416 + }, + "present.13.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 218103808 + }, + "present.13.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 226492416 + }, + "past_key_values.14.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 234881024 + }, + "past_key_values.14.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 243269632 + }, + "present.14.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 234881024 + }, + "present.14.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 243269632 + }, + "past_key_values.15.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 251658240 + }, + "past_key_values.15.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 260046848 + }, + "present.15.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 251658240 + }, + "present.15.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 260046848 + }, + "past_key_values.16.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 268435456 + }, + "past_key_values.16.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 276824064 + }, + "present.16.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 268435456 + }, + "present.16.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 276824064 + }, + "past_key_values.17.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 285212672 + }, + "past_key_values.17.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 293601280 + }, + "present.17.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 285212672 + }, + "present.17.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 293601280 + }, + "past_key_values.18.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 301989888 + }, + "past_key_values.18.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 310378496 + }, + "present.18.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 301989888 + }, + "present.18.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 310378496 + }, + "past_key_values.19.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 318767104 + }, + "past_key_values.19.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 327155712 + }, + "present.19.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 318767104 + }, + "present.19.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 327155712 + }, + "past_key_values.20.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 335544320 + }, + "past_key_values.20.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 343932928 + }, + "present.20.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 335544320 + }, + "present.20.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 343932928 + }, + "past_key_values.21.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 352321536 + }, + "past_key_values.21.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 360710144 + }, + "present.21.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 352321536 + }, + "present.21.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 360710144 + }, + "past_key_values.22.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 369098752 + }, + "past_key_values.22.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 377487360 + }, + "present.22.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 369098752 + }, + "present.22.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 377487360 + }, + "past_key_values.23.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 385875968 + }, + "past_key_values.23.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 394264576 + }, + "present.23.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 385875968 + }, + "present.23.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 394264576 + }, + "past_key_values.24.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 402653184 + }, + "past_key_values.24.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 411041792 + }, + "present.24.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 402653184 + }, + "present.24.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 411041792 + }, + "past_key_values.25.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 419430400 + }, + "past_key_values.25.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 427819008 + }, + "present.25.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 419430400 + }, + "present.25.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 427819008 + }, + "past_key_values.26.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 436207616 + }, + "past_key_values.26.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 444596224 + }, + "present.26.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 436207616 + }, + "present.26.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 444596224 + }, + "past_key_values.27.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 452984832 + }, + "past_key_values.27.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 461373440 + }, + "present.27.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 452984832 + }, + "present.27.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 461373440 + }, + "past_key_values.28.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 469762048 + }, + "past_key_values.28.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 478150656 + }, + "present.28.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 469762048 + }, + "present.28.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 478150656 + }, + "past_key_values.29.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 486539264 + }, + "past_key_values.29.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 494927872 + }, + "present.29.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 486539264 + }, + "present.29.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 494927872 + }, + "past_key_values.30.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 503316480 + }, + "past_key_values.30.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 511705088 + }, + "present.30.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 503316480 + }, + "present.30.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 511705088 + }, + "past_key_values.31.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 520093696 + }, + "past_key_values.31.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 528482304 + }, + "present.31.key": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 520093696 + }, + "present.31.value": { + "packed_buffer_label": "ext_buf_0", + "xrt_arg_id": 5, + "dtype": "bfloat16", + "shape": [ + 1, + 8, + 4096, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 528482304 + }, + "sin_cos_cache_token": { + "packed_buffer_label": "ext_buf_1", + "xrt_arg_id": 6, + "dtype": "bfloat16", + "shape": [ + 32768, + 128 + ], + "size_in_bytes": 8388608, + "op_tensor_size": 8388608, + "offset": 0 + } + }, + "aux_info": { + "is_llm": true + } +} \ No newline at end of file