| { | |
| # Relative paths are resolved against /lustre/fs0/scratch. NOTE(review): every path set below is absolute (/shared/...) except "log_dir" - confirm this base directory still applies. | |
| # Data: path of the training dataset (single-dataset form). | |
| "data_path": "/shared/data/neox-dclm_baseline-100B-perturbed-25-50/standard_text_document", | |
| # Alternatively, for a weighted mixture of several datasets, use the commented-out keys below instead of "data_path": | |
| # "train-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], | |
| # "test-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], | |
| # "valid-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], | |
| # "train-data-weights": [1., 2.], | |
| # "test-data-weights": [2., 1.], | |
| # "valid-data-weights": [0.5, 0.4], | |
| # If weight_by_num_documents is true, dataset weights are built from a multinomial distribution over groups of data according to the number of documents in each group. | |
| # WARNING: setting this to true will override any user-provided weights. | |
| # "weight_by_num_documents": false, | |
| # "weighted_sampler_alpha": 0.3, | |
| # Vocabulary and tokenizer | |
| "padded_vocab_size": 50304, | |
| "vocab_file": "/shared/ameyagod/HubbleSuite/vocab-data/olmo-0724-hf/tokenizer.json", | |
| "tokenizer_type": "HFTokenizer", | |
| "save": "/shared/pt_models/Hubble_1.1B/DCLM_100B/Perturbed-GBS_1024-SL_2048-DYNA_25_50", | |
| "load": "/shared/pt_models/Hubble_1.1B/DCLM_100B/Perturbed-GBS_1024-SL_2048-DYNA_25_50", | |
| "checkpoint_validation_with_forward_pass": False, | |
| # "tensorboard_dir": "tensorboard", | |
| "log_dir": "logs", | |
| "use_wandb": True, | |
| "wandb_host": "https://api.wandb.ai", | |
| "wandb_team": "usc_and_mpi", | |
| "wandb_project": "Hubble", | |
| "wandb_run_name": "Hubble_1.1B-DCLM_100B-Perturbed-GBS_1024-SL_2048-DYNA_25_50", | |
| } | |