data_local: /scratch/playground/data/bin_v2old_data/
data_remote: # If blank, files must be present in data_local

max_seq_len: 1024
tokenizer_name: tokenizer/camembertv2
mlm_probability: 0.3 # FlexBERT should use 30% masking for optimal performance
count_padding_tokens: false

# Run Name
run_name: camembertv2-base

# Model
model:
  name: flex_bert
  pretrained_model_name: configs/bert-base-uncased
  tokenizer_name: ${tokenizer_name}
  disable_train_metrics: true
  # FlexBERT 'base' generally uses the default architecture values from the Hugging Face BertConfig object
  # Note: if using the pretrained_checkpoint argument to create a model from an existing checkpoint, make sure
  # the model_config settings match the architecture of the existing model
  model_config:
    vocab_size: 32768
    init_method: full_megatron
    num_hidden_layers: 22
    hidden_size: 768
    intermediate_size: 1152
    num_attention_heads: 12 # to have head size of 64
    attention_layer: rope
    attention_probs_dropout_prob: 0.0
    attn_out_bias: false
    attn_out_dropout_prob: 0.1
    attn_qkv_bias: false
    bert_layer: prenorm
    embed_dropout_prob: 0.0
    embed_norm: true
    final_norm: true
    skip_first_prenorm: true
    embedding_layer: sans_pos
    loss_function: fa_cross_entropy
    loss_kwargs:
      reduction: mean
    mlp_dropout_prob: 0.0
    mlp_in_bias: false
    mlp_layer: glu
    mlp_out_bias: false
    normalization: layernorm
    norm_kwargs:
      eps: 1e-5
      bias: false
    hidden_act: gelu
    head_pred_act: gelu
    activation_function: gelu # set alongside hidden_act and head_pred_act; better safe than sorry
    padding: unpadded
    rotary_emb_dim: null
    rotary_emb_base: 10000.0
    rotary_emb_scale_base: null
    rotary_emb_interleaved: false
    allow_embedding_resizing: true
    sliding_window: 128
    global_attn_every_n_layers: 3
    unpad_embeddings: true
    compile_model: true
    masked_prediction: true

# Dataloaders
train_loader:
  name: litdata_tokenized
  dataset:
    local: ${data_local}
    remote: ${data_remote}
    split: train
    tokenizer_name: ${tokenizer_name}
    max_seq_len: ${max_seq_len}
    shuffle: true
    mlm_probability: ${mlm_probability}
    streaming: false
    shuffle_seed: ${seed}
    seed: ${seed}
  drop_last: true
  num_workers: 12
  sequence_packing: true
  # batch_size_warmup_min_size: 96
  # batch_size_warmup_tokens: 50_000_000_000tok

eval_loader:
  name: litdata_tokenized
  dataset:
    local: ${data_local}
    remote: ${data_remote}
    split: validation
    tokenizer_name: ${tokenizer_name}
    max_seq_len: ${max_seq_len}
    shuffle: false
    mlm_probability: 0.15 # We always evaluate at 15% masking for consistent comparison
    streaming: false
    seed: ${seed}
  drop_last: false
  num_workers: 12
  sequence_packing: false

# Optimization
scheduler:
  name: warmup_stable_decay
  t_warmup: 3_000_000_000tok
  alpha_f: 0.00 # Linearly decay to 0x the full LR by the end of the training duration
  t_decay: 0tok

optimizer:
  name: decoupled_adamw
  lr: 8e-4 # Peak learning rate
  betas:
  - 0.9
  - 0.98
  eps: 1.0e-06
  weight_decay: 1.0e-5 # Amount of weight decay regularization
  filter_bias_norm_wd: true # If true, doesn't apply weight decay to norm layers and biases
  log_grad_norm: true

max_duration: 1_000_000_000_000tok
eval_interval: 5000ba
global_train_batch_size: 4608
global_eval_batch_size: 6144

# System
seed: 25
device_eval_batch_size: 128
device_train_microbatch_size: 96
precision: amp_bf16

# Logging
progress_bar: true
log_to_console: true
console_log_interval: 100ba

callbacks:
  runtime_estimator: {}
  dataloader_speed: {}
  speed_monitor:
    window_size: 100
  lr_monitor: {}
  scheduled_gc: {}
  # log_grad_norm:
  #   batch_log_interval: 100
  packing_efficiency:
    log_interval: 100

loggers:
  # wandb:
  #   project: fr_modernbert
  #   entity: wissam
  tensorboard:
    log_dir: /scratch/playground/logs/tensorboard/
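
# Rough training-length sanity check (assumption: each batch contributes close to
# global_train_batch_size * max_seq_len = 4608 * 1024 ≈ 4.72M tokens; the exact count
# depends on packing efficiency and on count_padding_tokens: false):
#   total batches ≈ 1_000_000_000_000tok / 4.72M tok ≈ 212k
#   LR warmup     ≈ 3_000_000_000tok / 4.72M tok ≈ 640 batches
#   checkpoints   ≈ 212k / 5000ba ≈ 42 saves (see save_interval below)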

autoresume: true

# Checkpoint to local filesystem or remote object store
save_interval: 5000ba
save_num_checkpoints_to_keep: -1 # -1 keeps every checkpoint; set a positive value to clean up older checkpoints saved to DISK
save_folder: /scratch/playground/checkpoints/{run_name}

# Load from local filesystem or remote object store to resume from an existing checkpoint
# load_path: null

parallelism_config:
  fsdp:
    sharding_strategy: "FULL_SHARD"
    state_dict_type: "sharded"
    # mixed_precision:
    #   param_dtype: bf16
    #   reduce_dtype: bf16
    #   buffer_dtype: bf16
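
# Launch sketch (assumption: this YAML is consumed by a Composer-style trainer script,
# e.g. a `main.py` entry point as in FlexBERT/ModernBERT-style training repos; the
# script name and config path below are placeholders, not part of this config):
#   composer -n 8 main.py path/to/this_config.yaml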