upload_1b_neox.py

07974b6 verified 4 months ago

1.57 kB

	{
	# Site-specific settings for a single training run: dataset location,
	# tokenizer, checkpoint directories, and logging / Weights & Biases targets.
	# NOTE(review): the file and dataset names suggest this is consumed by
	# GPT-NeoX -- confirm against the launch script.

	# Paths are relative to /lustre/fs0/scratch
	# NOTE(review): every path set below except "log_dir" starts with "/", so it
	# reads as absolute rather than relative -- confirm how these are resolved
	# against the root named above.

	# Data etc.
	"data_path": "/shared/data/neox-dclm_baseline-100B-perturbed-25-50/standard_text_document",

	# or for weighted datasets:
	# "train-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"],
	# "test-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"],
	# "valid-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"],
	# "train-data-weights": [1., 2.],
	# "test-data-weights": [2., 1.],
	# "valid-data-weights": [0.5, 0.4],

	# If weight_by_num_documents is True, dataset weights are built from a
	# multinomial distribution over groups of data, according to the number of
	# documents in each group.
	# WARNING: setting this to True will override any user-provided weights.
	# "weight_by_num_documents": false,
	# "weighted_sampler_alpha": 0.3,

	# Vocab
	"padded_vocab_size": 50304,
	"vocab_file": "/shared/ameyagod/HubbleSuite/vocab-data/olmo-0724-hf/tokenizer.json",
	"tokenizer_type": "HFTokenizer",

	# Checkpoints. "save" and "load" point at the same directory
	# (presumably so that a restarted job resumes from its own checkpoints --
	# confirm).
	"save": "/shared/pt_models/Hubble_1.1B/DCLM_100B/Perturbed-GBS_1024-SL_2048-DYNA_25_50",
	"load": "/shared/pt_models/Hubble_1.1B/DCLM_100B/Perturbed-GBS_1024-SL_2048-DYNA_25_50",
	"checkpoint_validation_with_forward_pass": False,

	# Logging. "tensorboard_dir" is left unset here; "log_dir" is the only
	# path in this file that does not start with "/".
	# "tensorboard_dir": "tensorboard",
	"log_dir": "logs",

	# Weights & Biases. The run name mirrors the trailing components of the
	# checkpoint directory above, joined with "-" instead of "/"; keep the two
	# in sync when changing either.
	"use_wandb": True,
	"wandb_host": "https://api.wandb.ai",
	"wandb_team": "usc_and_mpi",
	"wandb_project": "Hubble",
	"wandb_run_name": "Hubble_1.1B-DCLM_100B-Perturbed-GBS_1024-SL_2048-DYNA_25_50",
	}