data_local: /scratch/playground/data/bin_v2old_data/
data_remote: # If blank, files must be present in data_local

max_seq_len: 1024
tokenizer_name: tokenizer/camembertv2
mlm_probability: 0.3 # FlexBERT should use 30% masking for optimal performance
count_padding_tokens: false

# Run Name
run_name: camembertv2-base

# Model
model:
  name: flex_bert
  pretrained_model_name: configs/bert-base-uncased
  tokenizer_name: ${tokenizer_name}
  disable_train_metrics: true
  # FlexBERT 'base' generally uses the default architecture values from the Hugging Face BertConfig object
  # Note: if using the pretrained_checkpoint argument to create a model from an existing checkpoint, make sure
  # the model_config settings match the architecture of the existing model
  model_config:
    vocab_size: 32768
    init_method: full_megatron
    num_hidden_layers: 22
    hidden_size: 768
    intermediate_size: 1152
    num_attention_heads: 12 # to have head size of 64
    attention_layer: rope
    attention_probs_dropout_prob: 0.0
    attn_out_bias: false
    attn_out_dropout_prob: 0.1
    attn_qkv_bias: false
    bert_layer: prenorm
    embed_dropout_prob: 0.0
    embed_norm: true
    final_norm: true
    skip_first_prenorm: true
    embedding_layer: sans_pos
    loss_function: fa_cross_entropy
    loss_kwargs:
      reduction: mean
    mlp_dropout_prob: 0.0
    mlp_in_bias: false
    mlp_layer: glu
    mlp_out_bias: false
    normalization: layernorm
    norm_kwargs:
      eps: 1e-5
      bias: false
    hidden_act: gelu
    head_pred_act: gelu
    activation_function: gelu # set alongside hidden_act and head_pred_act; better safe than sorry
    padding: unpadded
    rotary_emb_dim: null
    rotary_emb_base: 10000.0
    rotary_emb_scale_base: null
    rotary_emb_interleaved: false
    allow_embedding_resizing: true
    sliding_window: 128
    global_attn_every_n_layers: 3
    unpad_embeddings: true
    compile_model: true
    masked_prediction: true

# Dataloaders
train_loader:
  name: litdata_tokenized
  dataset:
    local: ${data_local}
    remote: ${data_remote}
    split: train
    tokenizer_name: ${tokenizer_name}
    max_seq_len: ${max_seq_len}
    shuffle: true
    mlm_probability: ${mlm_probability}
    streaming: false
    shuffle_seed: ${seed}
    seed: ${seed}
  drop_last: true
  num_workers: 12
  sequence_packing: true
  # batch_size_warmup_min_size: 96
  # batch_size_warmup_tokens: 50_000_000_000tok

eval_loader:
  name: litdata_tokenized
  dataset:
    local: ${data_local}
    remote: ${data_remote}
    split: validation
    tokenizer_name: ${tokenizer_name}
    max_seq_len: ${max_seq_len}
    shuffle: false
    mlm_probability: 0.15 # We always evaluate at 15% masking for consistent comparison
    streaming: false
    seed: ${seed}
  drop_last: false
  num_workers: 12
  sequence_packing: false

# Optimization
scheduler:
  name: warmup_stable_decay
  t_warmup: 3_000_000_000tok
  alpha_f: 0.00 # Linearly decay to 0x the full LR by the end of the training duration
  t_decay: 0tok

optimizer:
  name: decoupled_adamw
  lr: 8e-4 # Peak learning rate
  betas:
  - 0.9
  - 0.98
  eps: 1.0e-06
  weight_decay: 1.0e-5 # Amount of weight decay regularization
  filter_bias_norm_wd: true # If true, doesn't apply weight decay to norm layers and biases
  log_grad_norm: true

max_duration: 1_000_000_000_000tok
eval_interval: 5000ba
global_train_batch_size: 4608
global_eval_batch_size: 6144

# System
seed: 25
device_eval_batch_size: 128
device_train_microbatch_size: 96
precision: amp_bf16

# Logging
progress_bar: true
log_to_console: true
console_log_interval: 100ba

callbacks:
  runtime_estimator: {}
  dataloader_speed: {}
  speed_monitor:
    window_size: 100
  lr_monitor: {}
  scheduled_gc: {}
  # log_grad_norm:
  #   batch_log_interval: 100
  packing_efficiency:
    log_interval: 100

loggers:
  # wandb:
  #   project: fr_modernbert
  #   entity: wissam
  tensorboard:
    log_dir: /scratch/playground/logs/tensorboard/
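
# Rough training-length sanity check (assumption: each batch contributes close to
# global_train_batch_size * max_seq_len = 4608 * 1024 ≈ 4.72M tokens; the exact count
# depends on packing efficiency and on count_padding_tokens: false):
#   total batches ≈ 1_000_000_000_000tok / 4.72M tok ≈ 212k
#   LR warmup     ≈ 3_000_000_000tok / 4.72M tok ≈ 640 batches
#   checkpoints   ≈ 212k / 5000ba ≈ 42 saves (see save_interval below)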

autoresume: true

# Checkpoint to local filesystem or remote object store
save_interval: 5000ba
save_num_checkpoints_to_keep: -1 # -1 keeps every checkpoint; set a positive value to clean up older checkpoints saved to DISK
save_folder: /scratch/playground/checkpoints/{run_name}

# Load from local filesystem or remote object store to resume from an existing checkpoint
# load_path: null

parallelism_config:
  fsdp:
    sharding_strategy: "FULL_SHARD"
    state_dict_type: "sharded"
    # mixed_precision:
    #   param_dtype: bf16
    #   reduce_dtype: bf16
    #   buffer_dtype: bf16
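
# Launch sketch (assumption: this YAML is consumed by a Composer-style trainer script,
# e.g. a `main.py` entry point as in FlexBERT/ModernBERT-style training repos; the
# script name and config path below are placeholders, not part of this config):
#   composer -n 8 main.py path/to/this_config.yaml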