Here are all the configs used. Note that this was a while ago, so some parameter names may have changed since then.
deepspeed.yaml
compute_environment: LOCAL_MACHINE
deepspeed_config:
  deepspeed_multinode_launcher: standard
  deepspeed_config_file: ds_config.json
  zero3_init_flag: true
distributed_type: DEEPSPEED
fsdp_config: {}
machine_rank: 0
main_process_ip: null
main_process_port: null
main_training_function: main
num_machines: 1
num_processes: 8
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
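
For context, deepspeed.yaml is the file passed to accelerate launch, and it points at the DeepSpeed JSON below via deepspeed_config_file. The training script itself isn't included here, but with an Open R1-style grpo.py entry point the launch looks roughly like:

accelerate launch --config_file deepspeed.yaml grpo.py --config experiment_config.yaml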
ds_config.json
{
    "bf16": {
        "enabled": true,
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "gradient_accumulation_steps": "auto",
    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": true
        },
        "overlap_comm": true,
        "contiguous_gradients": true,
        "sub_group_size": 1e9,
        "reduce_bucket_size": 1e6,
        "stage3_prefetch_bucket_size": 0.94e6,
        "stage3_param_persistence_threshold": 1e4,
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9,
        "stage3_gather_fp16_weights_on_model_save": true
    },
    "train_batch_size": "auto",
    "steps_per_print": 2000,
    "wall_clock_breakdown": false
}
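
The "auto" fields are filled in from the trainer arguments at launch time. If I read the resolution right, with the experiment config below that works out to train_batch_size = per_device_train_batch_size x num_processes x gradient_accumulation_steps = 4 x 8 x 1 = 32 completions per optimizer step, and with num_generations: 4 that is 8 unique prompts per step.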
experiment_config.yaml
# Model arguments
model_name_or_path: Qwen/Qwen2.5-Math-72B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
dataset_name: DigitalLearningGmbH/MATH-lighteval
dataset_config: default
dataset_prompt_column: problem
system_prompt: "You are a helpful AI Assistant, designed to provide well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within <think> and </think> tags."
# GRPO trainer config
bf16: true
use_vllm: true
vllm_mode: colocate
vllm_tensor_parallel_size: 8
vllm_gpu_memory_utilization: 0.5
vllm_enable_prefix_caching: false
vllm_max_model_len: 4096
do_eval: false
gradient_accumulation_steps: 1
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
learning_rate: 3.0e-06
log_completions: false
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: cosine
max_prompt_length: 512
max_completion_length: 3584
max_steps: -1
num_generations: 4
num_train_epochs: 1
overwrite_output_dir: true
per_device_train_batch_size: 4 
push_to_hub: false
reward_funcs:
- accuracy
- format
reward_weights:
- 1.0
- 1.0
eval_strategy: "no"
save_strategy: "steps"
save_steps: 30
save_total_limit: 3
report_to: 
- wandb
seed: 42
temperature: 0.7
warmup_ratio: 0.1
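
The accuracy and format entries under reward_funcs refer to reward functions registered in the training script, which aren't included here. As a rough sketch of what the format reward checks (whether the completion follows the <think>...</think> convention from the system prompt), assuming TRL's reward interface of a list of completions in and one float per completion out:

import re

# Rough sketch of a "format" reward, not the exact function from this run.
# It only checks that the completion wraps its reasoning in <think>...</think>.
THINK_PATTERN = re.compile(r"^<think>.*?</think>.*$", re.DOTALL)

def format_reward(completions, **kwargs):
    """Return 1.0 for completions that follow the <think>...</think> format, else 0.0."""
    rewards = []
    for completion in completions:
        # With conversational datasets each completion is a list of chat messages;
        # with plain-text datasets it is just a string.
        text = completion[0]["content"] if isinstance(completion, list) else completion
        rewards.append(1.0 if THINK_PATTERN.match(text.strip()) else 0.0)
    return rewards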