```json
{
  "_name_or_path": "/data5/models/Llama-3.1-Nemotron-70B-Instruct-HF",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 8192,
  "initializer_range": 0.02,
  "intermediate_size": 28672,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 64,
  "num_hidden_layers": 80,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "quantization_config": {
    "amp": true,
    "autoround_version": "0.4.2.dev",
    "batch_size": 8,
    "bits": 4,
    "data_type": "int",
    "dataset": "NeelNanda/pile-10k",
    "enable_minmax_tuning": true,
    "enable_norm_bias_tuning": false,
    "enable_quanted_input": true,
    "gradient_accumulate_steps": 1,
    "group_size": 128,
    "iters": 200,
    "low_gpu_mem_usage": false,
    "lr": 0.005,
    "minmax_lr": 0.005,
    "nsamples": 128,
    "quant_method": "auto-round",
    "scale_dtype": "torch.float16",
    "seqlen": 2048,
    "sym": true,
    "to_quant_block_names": [
      [
        "model.layers.0",
        "model.layers.1",
        "model.layers.2",
        "model.layers.3",
        "model.layers.4",
        "model.layers.5",
        "model.layers.6",
        "model.layers.7",
        "model.layers.8",
        "model.layers.9",
        "model.layers.10",
        "model.layers.11",
        "model.layers.12",
        "model.layers.13",
        "model.layers.14",
        "model.layers.15",
        "model.layers.16",
        "model.layers.17",
        "model.layers.18",
        "model.layers.19",
        "model.layers.20",
        "model.layers.21",
        "model.layers.22",
        "model.layers.23",
        "model.layers.24",
        "model.layers.25",
        "model.layers.26",
        "model.layers.27",
        "model.layers.28",
        "model.layers.29",
        "model.layers.30",
        "model.layers.31",
        "model.layers.32",
        "model.layers.33",
        "model.layers.34",
        "model.layers.35",
        "model.layers.36",
        "model.layers.37",
        "model.layers.38",
        "model.layers.39",
        "model.layers.40",
        "model.layers.41",
        "model.layers.42",
        "model.layers.43",
        "model.layers.44",
        "model.layers.45",
        "model.layers.46",
        "model.layers.47",
        "model.layers.48",
        "model.layers.49",
        "model.layers.50",
        "model.layers.51",
        "model.layers.52",
        "model.layers.53",
        "model.layers.54",
        "model.layers.55",
        "model.layers.56",
        "model.layers.57",
        "model.layers.58",
        "model.layers.59",
        "model.layers.60",
        "model.layers.61",
        "model.layers.62",
        "model.layers.63",
        "model.layers.64",
        "model.layers.65",
        "model.layers.66",
        "model.layers.67",
        "model.layers.68",
        "model.layers.69",
        "model.layers.70",
        "model.layers.71",
        "model.layers.72",
        "model.layers.73",
        "model.layers.74",
        "model.layers.75",
        "model.layers.76",
        "model.layers.77",
        "model.layers.78",
        "model.layers.79"
      ]
    ]
  },
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 8.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.46.3",
  "use_cache": true,
  "vocab_size": 128256
}
```
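
A checkpoint carrying this config advertises `"quant_method": "auto-round"`, so `transformers` can load the INT4 weights directly when the `auto-round` package is installed alongside it. Below is a minimal loading sketch; the checkpoint path is a placeholder for wherever the quantized model lives locally, not a published model ID.

```python
# Minimal loading sketch for an AutoRound INT4 checkpoint (quant_method: "auto-round").
# Assumes the `auto-round` package is installed alongside transformers;
# model_path is a hypothetical local path, not a published model ID.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "/path/to/Llama-3.1-Nemotron-70B-Instruct-HF-int4"  # hypothetical

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",    # shard the 80 layers across available GPUs
    torch_dtype="auto",   # picks up "torch_dtype": "float16" from this config
)

# eos_token_id lists three terminators (128001, 128008, 128009); generate()
# reads them from the config, so no extra stopping logic is needed.
messages = [{"role": "user", "content": "Hello"}]
inputs = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)
outputs = model.generate(inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True))
```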
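
The `rope_scaling` block uses the `llama3` rule, which rescales the rotary inverse frequencies per band rather than uniformly: wavelengths longer than `original_max_position_embeddings / low_freq_factor` are divided by `factor`, wavelengths shorter than the high-frequency cutoff are left untouched, and the band in between is smoothly interpolated. The sketch below mirrors that logic as transformers applies it, written in NumPy purely for illustration.

```python
import numpy as np

# Values taken from this config's rope_scaling block and rope_theta / head_dim.
factor = 8.0
low_freq_factor = 1.0
high_freq_factor = 4.0
original_max_pos = 8192
rope_theta = 500_000.0
head_dim = 128

# Base rotary inverse frequencies, one per pair of head dimensions.
inv_freq = 1.0 / (rope_theta ** (np.arange(0, head_dim, 2) / head_dim))

low_freq_wavelen = original_max_pos / low_freq_factor    # 8192: scale beyond this
high_freq_wavelen = original_max_pos / high_freq_factor  # 2048: keep below this

wavelen = 2 * np.pi / inv_freq
# Interpolation weight for the transition band between the two cutoffs.
smooth = (original_max_pos / wavelen - low_freq_factor) / (
    high_freq_factor - low_freq_factor
)
scaled_inv_freq = np.where(
    wavelen < high_freq_wavelen,           # high-frequency band: unchanged
    inv_freq,
    np.where(
        wavelen > low_freq_wavelen,        # low-frequency band: fully scaled
        inv_freq / factor,
        (1 - smooth) * inv_freq / factor + smooth * inv_freq,  # transition band
    ),
)
```

This banded scaling is what lets the model extend from the original 8192-token context to the `max_position_embeddings` of 131072 declared above.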