| { | |
| "args": { | |
| "adam_epsilon": 1e-08, | |
| "alpha_ce": 0.1, | |
| "alpha_distil": 0.9, | |
| "ampere_learning_rate": 0.01, | |
| "ampere_mask_init": "constant", | |
| "ampere_mask_scale": 0.0, | |
| "ampere_pruning_method": "disabled", | |
| "cache_dir": "", | |
| "config_name": "", | |
| "data_dir": "squad_data", | |
| "do_eval": true, | |
| "do_lower_case": true, | |
| "do_train": true, | |
| "doc_stride": 128, | |
| "eval_all_checkpoints": true, | |
| "eval_batch_size": 16, | |
| "evaluate_during_training": false, | |
| "final_ampere_temperature": 20, | |
| "final_lambda": 50, | |
| "final_shuffling_temperature": 20, | |
| "final_threshold": 0.1, | |
| "final_warmup": 10, | |
| "fp16": false, | |
| "fp16_opt_level": "O1", | |
| "global_topk": false, | |
| "global_topk_frequency_compute": 25, | |
| "gradient_accumulation_steps": 1, | |
| "in_shuffling_group": 4, | |
| "initial_ampere_temperature": 0.0, | |
| "initial_shuffling_temperature": 0.1, | |
| "initial_threshold": 0.0, | |
| "initial_warmup": 1, | |
| "lang_id": 0, | |
| "learning_rate": 3e-05, | |
| "local_rank": -1, | |
| "logging_steps": 500, | |
| "mask_block_cols": 32, | |
| "mask_block_rows": 32, | |
| "mask_init": "constant", | |
| "mask_scale": 0.0, | |
| "mask_scores_learning_rate": 0.01, | |
| "max_answer_length": 30, | |
| "max_grad_norm": 1.0, | |
| "max_query_length": 64, | |
| "max_seq_length": 384, | |
| "max_steps": -1, | |
| "model_name_or_path": "bert-base-uncased", | |
| "model_type": "masked_bert", | |
| "n_best_size": 20, | |
| "n_gpu": 1, | |
| "no_cuda": false, | |
| "null_score_diff_threshold": 0.0, | |
| "num_train_epochs": 20.0, | |
| "out_shuffling_group": 4, | |
| "overwrite_cache": false, | |
| "overwrite_output_dir": true, | |
| "per_gpu_eval_batch_size": 16, | |
| "per_gpu_train_batch_size": 16, | |
| "predict_file": "dev-v1.1.json", | |
| "pruning_method": "sigmoied_threshold", | |
| "pruning_submethod": "default", | |
| "regularization": "l1", | |
| "save_steps": 5000, | |
| "seed": 42, | |
| "server_ip": "", | |
| "server_port": "", | |
| "shuffling_learning_rate": 0.001, | |
| "shuffling_method": "disabled", | |
| "teacher_name_or_path": "csarron/bert-base-uncased-squad-v1", | |
| "teacher_type": "bert", | |
| "temperature": 2.0, | |
| "threads": 8, | |
| "tokenizer_name": "", | |
| "train_batch_size": 16, | |
| "train_file": "train-v1.1.json", | |
| "truncate_train_examples": -1, | |
| "verbose_logging": false, | |
| "version_2_with_negative": false, | |
| "warmup_steps": 5400, | |
| "weight_decay": 0.0 | |
| }, | |
| "config": { | |
| "_name_or_path": "bert-base-uncased", | |
| "ampere_mask_init": "constant", | |
| "ampere_mask_scale": 0.0, | |
| "ampere_pruning_method": "disabled", | |
| "architectures": ["MaskedBertForQuestionAnswering"], | |
| "attention_probs_dropout_prob": 0.1, | |
| "hidden_act": "gelu", | |
| "hidden_dropout_prob": 0.1, | |
| "hidden_size": 768, | |
| "in_shuffling_group": 4, | |
| "initializer_range": 0.02, | |
| "intermediate_size": 3072, | |
| "layer_norm_eps": 1e-12, | |
| "mask_block_cols": 32, | |
| "mask_block_rows": 32, | |
| "mask_init": "constant", | |
| "mask_scale": 0.0, | |
| "max_position_embeddings": 512, | |
| "model_type": "masked_bert", | |
| "num_attention_heads": 12, | |
| "num_hidden_layers": 12, | |
| "out_shuffling_group": 4, | |
| "pad_token_id": 0, | |
| "pruning_method": "sigmoied_threshold", | |
| "pruning_submethod": "default", | |
| "shuffling_method": "disabled", | |
| "type_vocab_size": 2, | |
| "vocab_size": 30522 | |
| }, | |
| "packaging": { | |
| "model_name": "madlag/bert-base-uncased-squad1.1-block-sparse-0.20-v1", | |
| "model_owner": "madlag", | |
| "pytorch_final_file_size": 364810487 | |
| }, | |
| "performance": { | |
| "dense": { | |
| "eval_elapsed_time": 42.67642272700323 | |
| }, | |
| "pytorch_block_sparse": { | |
| "eval_elapsed_time": 30.794714744988596 | |
| }, | |
| "speedup": 1.3858359488115801 | |
| }, | |
| "precision": { | |
| "exact": 76.98202514648438, | |
| "f1": 85.4483871459961 | |
| }, | |
| "sparsity": { | |
| "ampere": false, | |
| "block_size": [32, 32], | |
| "block_sparse": true, | |
| "block_sparse_density": 0.2017264660493827, | |
| "block_sparse_nnz": 16732, | |
| "block_sparse_total": 82944, | |
| "global_density": 0.3807202378419934, | |
| "is_block_sparse_valid": true, | |
| "nnz_parameters": 41682690, | |
| "parameters": 109483778, | |
| "pruned_heads": { | |
| "0": [0, 2, 4, 5, 6, 7, 9, 11], | |
| "1": [0, 1, 2, 3, 5, 6, 7, 8, 9], | |
| "2": [1, 2, 3, 4, 5, 7, 8, 11], | |
| "3": [2, 3, 4, 6, 7, 10], | |
| "4": [0, 1, 2, 6, 7, 8, 10, 11], | |
| "5": [0, 1, 2, 4, 5, 6, 7, 11], | |
| "6": [0, 2, 3, 4, 6, 7, 10], | |
| "7": [1, 3, 6, 7, 11], | |
| "8": [0, 2, 3, 4, 5, 6, 7, 8], | |
| "9": [1, 3, 4, 5, 7, 9, 10], | |
| "10": [1, 4, 5, 6, 7, 8, 9], | |
| "11": [0, 2, 3, 5, 6, 7, 8, 10, 11] | |
| }, | |
| "total_attention_heads": 144, | |
| "total_pruned_attention_heads": 90 | |
| } | |
| } |