{
  "force": true,
  "dump_path": "data/distilgpt2-e7",
  "student_type": "gpt2",
  "student_config": "distilgpt2-ja.json",
  "student_pretrained_weights": "data/distilgpt2-e6/checkpoint.pth",
  "teacher_type": "gpt2",
  "teacher_name": "rinna/japanese-gpt2-medium",
  "temperature": 2.0,
  "alpha_ce": 5.0,
  "alpha_mlm": 0.0,
  "alpha_clm": 0.5,
  "alpha_mse": 0.0,
  "alpha_cos": 1.0,
  "mlm": false,
  "mlm_mask_prop": 0.15,
  "word_mask": 0.8,
  "word_keep": 0.1,
  "word_rand": 0.1,
  "mlm_smoothing": 0.7,
  "restrict_ce_to_mask": false,
  "freeze_pos_embs": true,
  "freeze_token_type_embds": false,
  "n_epoch": 2,
  "batch_size": 16,
  "group_by_size": false,
  "gradient_accumulation_steps": 50,
  "warmup_prop": 0.0,
  "weight_decay": 0.0,
  "learning_rate": 4.9e-05,
  "adam_epsilon": 1e-06,
  "max_grad_norm": 5.0,
  "initializer_range": 0.02,
  "fp16": false,
  "fp16_opt_level": "O1",
  "n_gpu": 4,
  "local_rank": 0,
  "seed": 56,
  "log_interval": 500,
  "checkpoint_interval": 4000,
  "world_size": 4,
  "n_gpu_per_node": 4,
  "global_rank": 0,
  "n_nodes": 1,
  "node_id": 0,
  "multi_gpu": true,
  "is_master": true,
  "multi_node": false
}