---
# Training-run parameter dump: one `key: value` pair per line, keys kept in
# their original (alphabetical) order and values reproduced verbatim.
#
# NOTE(review): `None`, `True` and `False` look like Python literals. A YAML
# loader reads `None` as the string "None", not as null — confirm how (or
# whether) this file is parsed before normalizing them to `null`/`true`/`false`.
# NOTE(review): `eps: 1e-06` has no decimal point; YAML 1.1 loaders such as
# PyYAML read it as a string rather than a float — confirm before relying on it.
NDR_patch_size: 16
accum_freq: 1
aug_cfg: {}
batch_size: 1024
beta1: 0.9
beta2: 0.98
checkpoint_path: ./logs-lr1e-3-datacomp/clip_vit_b16_s512m_bs16k_mix0_8/checkpoints
coca_caption_loss_weight: 2.0
coca_contrastive_loss_weight: 1.0
copy_codebase: False
csv_caption_key: title
csv_img_key: filepath
# NOTE(review): the value is blank in the dump. It may originally have been a
# whitespace character (e.g. a tab) that was lost in formatting — confirm.
csv_separator:
dataset_resampled: False
dataset_type: webdataset
ddp_static_graph: True
debug: False
delete_prev_step_ckpt: True
delete_previous_checkpoint: False
device: cuda:0
dist_backend: nccl
dist_url: env://
distill: False
distill_model: None
distill_pretrained: None
distributed: True
epochs: 4
epochs_cooldown: None
eps: 1e-06
force_custom_text: False
force_image_size: 224
force_patch_dropout: None
force_quick_gelu: False
gather_with_grad: True
# Equals batch_size (1024) * world_size (16).
global_batch_size: 16384
grad_checkpointing: True
grad_clip_norm: None
horovod: False
image_interpolation: None
image_mean: None
image_resize_mode: None
image_std: None
imagenet_v2: None
imagenet_val: /mnt/bn/zilongdata-hl/dataset/imagenet/val
is_cls_token: True
local_loss: True
local_rank: 0
lock_image: False
lock_image_freeze_bn_stats: False
lock_image_unlocked_groups: 0
lock_text: False
lock_text_freeze_layer_norm: False
lock_text_unlocked_layers: 0
log_every_n_steps: 128
log_level: 20
log_local: False
log_path: ./logs-lr1e-3-datacomp/clip_vit_b16_s512m_bs16k_mix0_8/out.log
logs: ./logs-lr1e-3-datacomp
lr: 0.001
lr_cooldown_end: 0.0
lr_cooldown_power: 1.0
lr_scheduler: cosine
max_seq_len: 15000
model: ViT-B-16
name: clip_vit_b16_s512m_bs16k_mix0_8
native_dynamic_resolution: False
no_set_device_rank: False
only_packing: False
precision: amp
# The next three values are blank in the dump; a YAML loader reads a bare
# `key:` as null, not as an empty string.
pretrained:
pretrained_image:
pretrained_text:
rank: 0
remote_sync: None
remote_sync_frequency: 300
remote_sync_protocol: s3
report_to: wandb
resume: None
rope_attn_num_heads: 12
rope_model_width: 768
save_every_n_steps: 6104
save_frequency: 1
save_most_recent: False
seed: 0
siglip: False
skip_scheduler: False
tensorboard: False
# Blank in the dump (parsed as null by a YAML loader).
tensorboard_path:
torchcompile: False
torchscript: False
trace: False
train_data: /mnt/bn/zilongdata-hl/dataset/Recap-DataComp-1B-Dataset/{000000..140146}.tar
train_data_upsampling_factors: None
train_num_samples: 128000000
use_bn_sync: False
use_bnb_linear: None
val_data: None
val_frequency: 1
val_num_samples: None
val_steps: 0
wandb: True
# Blank in the dump (parsed as null by a YAML loader).
wandb_notes:
wandb_project_name: cls-clip-NDR
warmup: 500
wd: 0.2
workers: 1
world_size: 16
zeroshot_frequency: 4
zeroshot_steps: 0