Spaces:

nvidia
/

audio-flamingo-2

Runtime error

App Files Files Community

anon-sub-openreview committed on Jun 25

Commit

79a27a2

1 Parent(s): 01dd58e

updated ckpt

Browse files

Files changed (21) hide show

configs/inference.yaml +9 -202
configs/inference_1.5.yaml +0 -302
configs/inference_2.yaml +0 -302
configs/inference_long.yaml +0 -284
configs/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_ICL4x16win-4node.yaml +0 -255
configs/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node.yaml +0 -183
configs/run_demo_sft_fp32_xattnevery1_msclapcap_win7_ovlp5.25_ICL4x16win-4node.yaml +0 -483
configs/run_demo_sft_fp32_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node.yaml +0 -284
data/__pycache__/data.cpython-38.pyc +0 -0
data/data.py +1 -1
my_laion_clap/CLAP/src/laion_clap/clap_module/model.py +1 -1
my_ms_clap/src/configs/config_2022.yml +1 -1
my_ms_clap/src/models/config.py +1 -1
src/__pycache__/__init__.cpython-38.pyc +0 -0
src/__pycache__/factory.cpython-38.pyc +0 -0
src/__pycache__/flamingo.cpython-38.pyc +0 -0
src/__pycache__/flamingo_lm.cpython-38.pyc +0 -0
src/__pycache__/helpers.cpython-38.pyc +0 -0
src/__pycache__/utils.cpython-38.pyc +0 -0
src/factory.py +2 -2
src/helpers.py +3 -3

configs/inference.yaml CHANGED Viewed

@@ -1,6 +1,6 @@
 train_config:
-  expdir: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed-sft
-  run_name: run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-3b-fixed-sft-3
   delete_previous_checkpoint: true
   batch_size: 8
   gradient_accumulation_steps: 2
@@ -24,225 +24,32 @@ train_config:
   fsdp_sharding_strategy: full  # full, hybrid
   horovod: false
-# instruction tuning hparams
-# sft_config:
-#   pretrained_path: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-3b-fixed_ckpt_stage1/
-#   pretrained_ckpt: checkpoint_199.pt
-#   unfreeze_full_lm: false
 data_config:
   dataset_blending_global_weight: 0.005
   dataset_blending_config:
-    MMAUQA/train:
-      weight: 1.5
-    AudioSet-Temporal-Speech-Audio-QA/train:
-      weight: 1.0
-    CompA-R-AQA/train:
-      weight: 1.0
-    # Audio QA
-    Clotho-AQA-AQA/train:
-      weight: 1.0
-    OpenAQA-AQA/train:
-      weight: 1.0
-    SalmonnQA/train:
-      weight: 1.0
-    AudioEntailmentQA/train:
-      weight: 1.0
-    # Audio Captioning
-    Clotho-v2-AudioCaptioning/train:
-      weight: 1.0
-    audiocaps-AudioCaptioning/train:
-      weight: 1.0
-    Epidemic_sound-AudioCaptioning/train:
-      weight: 1.0
-    MACS-AudioCaptioning/train:
-      weight: 1.0
-    # Audio Classification
-    FSD50k-EventClassification/train:
-      weight: 1.0
-    CochlScene-SceneClassification/train:
-      weight: 1.0
-    NonSpeech7k-EventClassification/train:
-      weight: 1.0
-    chime-home-EventClassification/train:
-      weight: 1.0
-    SONYC-UST-EventClassification/train:
-      weight: 1.0
-    # Speech Emotion Classification
-    MELD-EmotionClassification/train:
-      weight: 0.5
-    MELD-SentimentClassification/train:
-      weight: 0.5
-    emov-db-EmotionClassification/train:
-      weight: 1.0
-    jl-corpus-EmotionClassification/train:
-      weight: 6.0
-    tess-EmotionClassification/train:
-      weight: 2.5
-    IEMOCAP-EmotionClassification/train:
-      weight: 3.0
-    OMGEmotion-EmotionClassification/train:
-      weight: 3.0
-    VocalSound-VocalClassification/train:
-      weight: 1.5
-    # Music QA
-    Music-AVQA-AQA_All/train:
-      weight: 3.0
-    MU-LLAMA-AQA/train:
-      weight: 1.0
-    # Music Captioning
-    LP-MusicCaps-MSD-AudioCaptioning/train:
-      weight: 0.06
-    LP-MusicCaps-MC-AudioCaptioning/train:
-      weight: 2.0
-    LP-MusicCaps-MTT-AudioCaptioning/train:
       weight: 1.0
-    MusicCaps-AudioCaptioning/train:
-      weight: 6.0
-    musdbhq-captioning/train:
-      weight: 2.0
-    # Music Understanding
-    NSynth-MIR/train:
-      weight: 0.2
-    mtg-jamendo-MusicTagging/train:
-      weight: 0.1
-    FMA-GenreClassification/train:
-      weight: 0.5
-    musdbhq-InstrClassification/train:
-      weight: 0.8
-    LLARK_FMA-mir/train:
-      weight: 1.0
-    LLARK_FMA-reasoning/train:
-      weight: 1.0
-    LLARK_MagnaTagATune-mir/train:
-      weight: 1.0
-    LLARK_MTG-Jamendo-reasoning/train:
-      weight: 1.0
-    LLARK_MagnaTagATune-reasoning/train:
-      weight: 1.0
-    LLARK_MTG-Jamendo-mir/train:
-      weight: 1.0
-    MusicBenchQA/train:
-      weight: 1.0
-  dataset_file_root: /lustre/fsw/portfolios/adlr/users/sreyang/final_qa/foundational_data
-  data_root: /lustre/fsw/portfolios/adlr/users/zkong/datasets
-  dataset_blending_output: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed/dataset_blending.json
   max_tokens: 512
   num_workers: 4
   valid_dataset_config:
-    Clotho-AQA-AQA/test: true
-    Clotho-v2-AudioCaptioning/test: true
-    audiocaps-AudioCaptioning/test: true
-    FSD50k-EventClassification/test: true
-    CochlScene-SceneClassification/test: true
-    NonSpeech7k-EventClassification/test: true
-    SONYC-UST-EventClassification/test: true
-    MELD-EmotionClassification/test: true
-    MELD-SentimentClassification/test: true
-    emov-db-EmotionClassification/val: true
-    jl-corpus-EmotionClassification/val: true
-    tess-EmotionClassification/val: true
-    IEMOCAP-EmotionClassification/val: true
-    OMGEmotion-EmotionClassification/val: true
-    VocalSound-VocalClassification/test: true
-    Music-AVQA-AQA_All/test: true
-    MU-LLAMA-AQA/test: true
-    LP-MusicCaps-MSD-AudioCaptioning/test: true
-    LP-MusicCaps-MC-AudioCaptioning/test: true
-    LP-MusicCaps-MTT-AudioCaptioning/test: true
-    MusicCaps-AudioCaptioning/test: true
-    NSynth-MIR/test: true
-    mtg-jamendo-MusicTagging/val: true
-    musdbhq-InstrClassification/test: true
-    # # zero shot
-    # CREMA-D-EmotionClassification/train:
-    #   prefix_prob: 1.0
-    # ravdess-EmotionClassification/train:
-    #   prefix_prob: 1.0
-    # UrbanSound8K-EventClassification/train:
-    #   prefix_prob: 1.0
-    # ESC50-EventClassification/train:
-    #   prefix_prob: 1.0
-    # DCASE17Task4-SceneClassification/test:
-    #   prefix_prob: 1.0
-    # GTZAN-GenreClassification/train:
-    #   prefix_prob: 1.0
-    # Medley-solos-DB-InstrClassification/test:
-    #   prefix_prob: 1.0
 clap_config:
   method: nvclap-large
   audio_embed_dim: 2048
-  checkpoint: clap_ckpt/epoch_15.pt
   window_length: 10.0  # seconds
   window_overlap: 0.0  # seconds
-  max_num_window: 9  # 1.5 minutes
   max_num_fewshot: 1  # number of fewshot samples (including the final one)
   finetune: true

 train_config:
+  expdir: ./af2_e3b
+  run_name: run_1
   delete_previous_checkpoint: true
   batch_size: 8
   gradient_accumulation_steps: 2
   fsdp_sharding_strategy: full  # full, hybrid
   horovod: false
 data_config:
   dataset_blending_global_weight: 0.005
   dataset_blending_config:
+    dataset/train:
       weight: 1.0
+  dataset_file_root: ./data
+  data_root: ./datasets
+  dataset_blending_output: ./dataset_blending.json
   max_tokens: 512
   num_workers: 4
   valid_dataset_config:
+    dataset/test: true
 clap_config:
   method: nvclap-large
   audio_embed_dim: 2048
+  checkpoint: clap_ckpt/epoch_16.pt
   window_length: 10.0  # seconds
   window_overlap: 0.0  # seconds
+  max_num_window: 12  # 2 minutes
   max_num_fewshot: 1  # number of fewshot samples (including the final one)
   finetune: true

configs/inference_1.5.yaml DELETED Viewed

@@ -1,302 +0,0 @@
-train_config:
-  expdir: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-1.5b
-  run_name: run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-1.5B-sft
-  delete_previous_checkpoint: true
-  batch_size: 32
-  gradient_accumulation_steps: 2
-  seed: 42
-  learning_rate: 0.00002
-  lr_scheduler: constant
-  loss_multiplier: 1.0
-  warmup_steps: 1875
-  weight_decay: 0.1
-  precision: amp_bf16  # ["amp_bf16", "amp_bfloat16", "bf16", "fp16", "fp32"]
-  gradient_checkpointing: False
-  num_epochs: 200  # num_epochs * dataset_blending_global_weight = 1
-  offline: false
-  freeze_lm_embeddings: false
-  logging_steps: 10
-  dist_backend: nccl
-  dist_url: env:// # tcp://localhost:7000
-  no_set_device_rank: false
-  fsdp: true
-  fsdp_use_orig_params: false  # Passed into the FSDP constructor. Enables param_groups and gradient masking for weight_decay. Does not work with OPT.
-  fsdp_sharding_strategy: full  # full, hybrid
-  horovod: false
-# instruction tuning hparams
-# sft_config:
-#   pretrained_path: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-7b-fixed/
-#   pretrained_ckpt: checkpoint_199.pt
-#   unfreeze_full_lm: false
-data_config:
-  dataset_blending_global_weight: 0.005
-  dataset_blending_config:
-    MMAUQA/train:
-      weight: 1.5
-    AudioSet-Temporal-Speech-Audio-QA/train:
-      weight: 1.0
-    CompA-R-AQA/train:
-      weight: 1.0
-    # Audio QA
-    Clotho-AQA-AQA/train:
-      weight: 1.0
-    OpenAQA-AQA/train:
-      weight: 1.0
-    SalmonnQA/train:
-      weight: 0.8
-    AudioEntailmentQA/train:
-      weight: 1.0
-    # Audio Captioning
-    Clotho-v2-AudioCaptioning/train:
-      weight: 1.0
-    audiocaps-AudioCaptioning/train:
-      weight: 1.0
-    Epidemic_sound-AudioCaptioning/train:
-      weight: 1.0
-    MACS-AudioCaptioning/train:
-      weight: 1.0
-    # Audio Classification
-    UrbanSound8K-EventClassification/train:
-      weight: 0.5
-    TUT-EventClassification/train:
-      weight: 2.0
-    FSD50k-EventClassification/train:
-      weight: 1.0
-    CochlScene-SceneClassification/train:
-      weight: 1.0
-    NonSpeech7k-EventClassification/train:
-      weight: 1.0
-    chime-home-EventClassification/train:
-      weight: 1.0
-    SONYC-UST-EventClassification/train:
-      weight: 1.0
-    # Speech Emotion Classification
-    MELD-EmotionClassification/train:
-      weight: 0.5
-    MELD-SentimentClassification/train:
-      weight: 0.5
-    emov-db-EmotionClassification/train:
-      weight: 1.0
-    jl-corpus-EmotionClassification/train:
-      weight: 6.0
-    tess-EmotionClassification/train:
-      weight: 2.5
-    IEMOCAP-EmotionClassification/train:
-      weight: 3.0
-    OMGEmotion-EmotionClassification/train:
-      weight: 3.0
-    VocalSound-VocalClassification/train:
-      weight: 1.5
-    # Music QA
-    Music-AVQA-AQA_All/train:
-      weight: 3.0
-    MU-LLAMA-AQA/train:
-      weight: 1.0
-    # Music Captioning
-    LP-MusicCaps-MSD-AudioCaptioning/train:
-      weight: 0.06
-    LP-MusicCaps-MC-AudioCaptioning/train:
-      weight: 2.0
-    LP-MusicCaps-MTT-AudioCaptioning/train:
-      weight: 1.0
-    MusicCaps-AudioCaptioning/train:
-      weight: 6.0
-    musdbhq-captioning/train:
-      weight: 2.0
-    # Music Understanding
-    Medley-solos-DB-InstrClassification/train:
-      weight: 1.5
-    GTZAN-GenreClassification/train:
-      weight: 2.0
-    NSynth-MIR/train:
-      weight: 0.4
-    NSynth-Instrument/train:
-      weight: 1.5
-    NSynth-Source/train:
-      weight: 1.5
-    mtg-jamendo-MusicTagging/train:
-      weight: 1.0
-    FMA-GenreClassification/train:
-      weight: 1.0
-    musdbhq-InstrClassification/train:
-      weight: 1.0
-    LLARK_FMA-mir/train:
-      weight: 1.0
-    LLARK_FMA-reasoning/train:
-      weight: 1.0
-    LLARK_MagnaTagATune-mir/train:
-      weight: 1.0
-    LLARK_MTG-Jamendo-reasoning/train:
-      weight: 1.0
-    LLARK_MagnaTagATune-reasoning/train:
-      weight: 1.0
-    LLARK_MTG-Jamendo-mir/train:
-      weight: 1.0
-    MusicBenchQA/train:
-      weight: 1.0
-  dataset_file_root: /lustre/fsw/portfolios/adlr/users/sreyang/final_qa/foundational_data
-  data_root: /lustre/fsw/portfolios/adlr/users/zkong/datasets
-  dataset_blending_output: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers/dataset_blending.json
-  max_tokens: 512
-  num_workers: 4
-  valid_dataset_config:
-    Clotho-AQA-AQA/test: true
-    Clotho-v2-AudioCaptioning/test: true
-    audiocaps-AudioCaptioning/test: true
-    FSD50k-EventClassification/test: true
-    CochlScene-SceneClassification/test: true
-    NonSpeech7k-EventClassification/test: true
-    SONYC-UST-EventClassification/test: true
-    MELD-EmotionClassification/test: true
-    MELD-SentimentClassification/test: true
-    emov-db-EmotionClassification/val: true
-    jl-corpus-EmotionClassification/val: true
-    tess-EmotionClassification/val: true
-    IEMOCAP-EmotionClassification/val: true
-    OMGEmotion-EmotionClassification/val: true
-    VocalSound-VocalClassification/test: true
-    Music-AVQA-AQA_All/test: true
-    MU-LLAMA-AQA/test: true
-    LP-MusicCaps-MSD-AudioCaptioning/test: true
-    LP-MusicCaps-MC-AudioCaptioning/test: true
-    LP-MusicCaps-MTT-AudioCaptioning/test: true
-    MusicCaps-AudioCaptioning/test: true
-    NSynth-MIR/test: true
-    mtg-jamendo-MusicTagging/val: true
-    musdbhq-InstrClassification/test: true
-    # zero shot
-    # CREMA-D-EmotionClassification/train:
-    #   prefix_prob: 1.0
-    # ravdess-EmotionClassification/train:
-    #   prefix_prob: 1.0
-    # UrbanSound8K-EventClassification/train:
-    #   prefix_prob: 1.0
-    # ESC50-EventClassification/train:
-    #   prefix_prob: 1.0
-    # DCASE17Task4-SceneClassification/test:
-    #   prefix_prob: 1.0
-    # GTZAN-GenreClassification/train:
-    #   prefix_prob: 1.0
-    # Medley-solos-DB-InstrClassification/test:
-    #   prefix_prob: 1.0
-clap_config:
-  method: nvclap-large
-  audio_embed_dim: 2048
-  checkpoint: /lustre/fsw/portfolios/adlr/users/sreyang/datasets/clap_datasets/clap_ckpts_5/15/ck_sim/checkpoints/epoch_15.pt
-  window_length: 10.0  # seconds
-  window_overlap: 0.0  # seconds
-  max_num_window: 9  # 1.5 minutes
-  max_num_fewshot: 1  # number of fewshot samples (including the final one)
-  finetune: true
-whisper_config:
-  method: whisper-large-v3
-  path: openai/whisper-large-v3
-  audio_embed_dim: 1280
-  sampling_rate: 16000
-  window_length: 30.0  # seconds
-  window_overlap: 0.0  # seconds
-  max_num_window: 1  # 30 seconds
-  max_num_fewshot: 1  # number of fewshot samples (including the final one)
-mert_config:
-  method: mert-v1
-  path: m-a-p/MERT-v1-330M
-  audio_embed_dim: 1024
-  sampling_rate: 24000
-  window_length: 10.0  # seconds
-  window_overlap: 0.0  # seconds
-  max_num_window: 1  # 10 seconds
-  max_num_fewshot: 1  # number of fewshot samples (including the final one)
-model_config:
-  cache_dir: /lustre/fsw/portfolios/adlr/users/sreyang/.cache
-  lang_encoder_path: Qwen/Qwen2.5-1.5B
-  tokenizer_path: Qwen/Qwen2.5-1.5B
-  cross_attn_every_n_layers: 1
-  audio_transformer_kwargs: {
-    n_head: 8,
-    n_layers: 3,
-    d_inner: 2048,
-    max_num_media: 128,  # must be >= max_num_window * num_fewshot_samples (4)
-    max_window_per_audio: 1,  # must = max_num_window
-    common_encoder_embed_dim: 1024
-  }

configs/inference_2.yaml DELETED Viewed

@@ -1,302 +0,0 @@
-train_config:
-  expdir: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers
-  run_name: run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-7b-fixed
-  delete_previous_checkpoint: true
-  batch_size: 4
-  gradient_accumulation_steps: 2
-  seed: 42
-  learning_rate: 0.00002
-  lr_scheduler: constant
-  loss_multiplier: 1.0
-  warmup_steps: 1875
-  weight_decay: 0.1
-  precision: amp_bf16  # ["amp_bf16", "amp_bfloat16", "bf16", "fp16", "fp32"]
-  gradient_checkpointing: False
-  num_epochs: 200  # num_epochs * dataset_blending_global_weight = 1
-  offline: false
-  freeze_lm_embeddings: false
-  logging_steps: 10
-  dist_backend: nccl
-  dist_url: env:// # tcp://localhost:7000
-  no_set_device_rank: false
-  fsdp: true
-  fsdp_use_orig_params: false  # Passed into the FSDP constructor. Enables param_groups and gradient masking for weight_decay. Does not work with OPT.
-  fsdp_sharding_strategy: full  # full, hybrid
-  horovod: false
-# instruction tuning hparams
-sft_config:
-  pretrained_path: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-7b-fixed/
-  pretrained_ckpt: checkpoint_199.pt
-  unfreeze_full_lm: false
-data_config:
-  dataset_blending_global_weight: 0.005
-  dataset_blending_config:
-    MMAUQA/train:
-      weight: 1.5
-    AudioSet-Temporal-Speech-Audio-QA/train:
-      weight: 1.0
-    CompA-R-AQA/train:
-      weight: 1.0
-    # Audio QA
-    Clotho-AQA-AQA/train:
-      weight: 1.0
-    OpenAQA-AQA/train:
-      weight: 1.0
-    SalmonnQA/train:
-      weight: 0.8
-    AudioEntailmentQA/train:
-      weight: 1.0
-    # Audio Captioning
-    Clotho-v2-AudioCaptioning/train:
-      weight: 1.0
-    audiocaps-AudioCaptioning/train:
-      weight: 1.0
-    Epidemic_sound-AudioCaptioning/train:
-      weight: 1.0
-    MACS-AudioCaptioning/train:
-      weight: 1.0
-    # Audio Classification
-    UrbanSound8K-EventClassification/train:
-      weight: 0.5
-    TUT-EventClassification/train:
-      weight: 2.0
-    FSD50k-EventClassification/train:
-      weight: 1.0
-    CochlScene-SceneClassification/train:
-      weight: 1.0
-    NonSpeech7k-EventClassification/train:
-      weight: 1.0
-    chime-home-EventClassification/train:
-      weight: 1.0
-    SONYC-UST-EventClassification/train:
-      weight: 1.0
-    # Speech Emotion Classification
-    MELD-EmotionClassification/train:
-      weight: 0.5
-    MELD-SentimentClassification/train:
-      weight: 0.5
-    emov-db-EmotionClassification/train:
-      weight: 1.0
-    jl-corpus-EmotionClassification/train:
-      weight: 6.0
-    tess-EmotionClassification/train:
-      weight: 2.5
-    IEMOCAP-EmotionClassification/train:
-      weight: 3.0
-    OMGEmotion-EmotionClassification/train:
-      weight: 3.0
-    VocalSound-VocalClassification/train:
-      weight: 1.5
-    # Music QA
-    Music-AVQA-AQA_All/train:
-      weight: 3.0
-    MU-LLAMA-AQA/train:
-      weight: 1.0
-    # Music Captioning
-    LP-MusicCaps-MSD-AudioCaptioning/train:
-      weight: 0.06
-    LP-MusicCaps-MC-AudioCaptioning/train:
-      weight: 2.0
-    LP-MusicCaps-MTT-AudioCaptioning/train:
-      weight: 1.0
-    MusicCaps-AudioCaptioning/train:
-      weight: 6.0
-    musdbhq-captioning/train:
-      weight: 2.0
-    # Music Understanding
-    Medley-solos-DB-InstrClassification/train:
-      weight: 1.5
-    GTZAN-GenreClassification/train:
-      weight: 2.0
-    NSynth-MIR/train:
-      weight: 0.4
-    NSynth-Instrument/train:
-      weight: 1.5
-    NSynth-Source/train:
-      weight: 1.5
-    mtg-jamendo-MusicTagging/train:
-      weight: 1.0
-    FMA-GenreClassification/train:
-      weight: 1.0
-    musdbhq-InstrClassification/train:
-      weight: 1.0
-    LLARK_FMA-mir/train:
-      weight: 1.0
-    LLARK_FMA-reasoning/train:
-      weight: 1.0
-    LLARK_MagnaTagATune-mir/train:
-      weight: 1.0
-    LLARK_MTG-Jamendo-reasoning/train:
-      weight: 1.0
-    LLARK_MagnaTagATune-reasoning/train:
-      weight: 1.0
-    LLARK_MTG-Jamendo-mir/train:
-      weight: 1.0
-    MusicBenchQA/train:
-      weight: 1.0
-  dataset_file_root: /lustre/fsw/portfolios/adlr/users/sreyang/final_qa/foundational_data
-  data_root: /lustre/fsw/portfolios/adlr/users/zkong/datasets
-  dataset_blending_output: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers/dataset_blending.json
-  max_tokens: 512
-  num_workers: 4
-  valid_dataset_config:
-    Clotho-AQA-AQA/test: true
-    Clotho-v2-AudioCaptioning/test: true
-    audiocaps-AudioCaptioning/test: true
-    FSD50k-EventClassification/test: true
-    CochlScene-SceneClassification/test: true
-    NonSpeech7k-EventClassification/test: true
-    SONYC-UST-EventClassification/test: true
-    MELD-EmotionClassification/test: true
-    MELD-SentimentClassification/test: true
-    emov-db-EmotionClassification/val: true
-    jl-corpus-EmotionClassification/val: true
-    tess-EmotionClassification/val: true
-    IEMOCAP-EmotionClassification/val: true
-    OMGEmotion-EmotionClassification/val: true
-    VocalSound-VocalClassification/test: true
-    Music-AVQA-AQA_All/test: true
-    MU-LLAMA-AQA/test: true
-    LP-MusicCaps-MSD-AudioCaptioning/test: true
-    LP-MusicCaps-MC-AudioCaptioning/test: true
-    LP-MusicCaps-MTT-AudioCaptioning/test: true
-    MusicCaps-AudioCaptioning/test: true
-    NSynth-MIR/test: true
-    mtg-jamendo-MusicTagging/val: true
-    musdbhq-InstrClassification/test: true
-    # zero shot
-    # CREMA-D-EmotionClassification/train:
-    #   prefix_prob: 1.0
-    # ravdess-EmotionClassification/train:
-    #   prefix_prob: 1.0
-    # UrbanSound8K-EventClassification/train:
-    #   prefix_prob: 1.0
-    # ESC50-EventClassification/train:
-    #   prefix_prob: 1.0
-    # DCASE17Task4-SceneClassification/test:
-    #   prefix_prob: 1.0
-    # GTZAN-GenreClassification/train:
-    #   prefix_prob: 1.0
-    # Medley-solos-DB-InstrClassification/test:
-    #   prefix_prob: 1.0
-clap_config:
-  method: nvclap-large
-  audio_embed_dim: 2048
-  checkpoint: /lustre/fsw/portfolios/adlr/users/sreyang/datasets/clap_datasets/clap_ckpts_5/15/ck_sim/checkpoints/epoch_15.pt
-  window_length: 10.0  # seconds
-  window_overlap: 0.0  # seconds
-  max_num_window: 9  # 1.5 minutes
-  max_num_fewshot: 1  # number of fewshot samples (including the final one)
-  finetune: true
-whisper_config:
-  method: whisper-large-v3
-  path: openai/whisper-large-v3
-  audio_embed_dim: 1280
-  sampling_rate: 16000
-  window_length: 30.0  # seconds
-  window_overlap: 0.0  # seconds
-  max_num_window: 1  # 30 seconds
-  max_num_fewshot: 1  # number of fewshot samples (including the final one)
-mert_config:
-  method: mert-v1
-  path: m-a-p/MERT-v1-330M
-  audio_embed_dim: 1024
-  sampling_rate: 24000
-  window_length: 10.0  # seconds
-  window_overlap: 0.0  # seconds
-  max_num_window: 1  # 10 seconds
-  max_num_fewshot: 1  # number of fewshot samples (including the final one)
-model_config:
-  cache_dir: /lustre/fsw/portfolios/adlr/users/sreyang/.cache
-  lang_encoder_path: Qwen/Qwen2.5-3B
-  tokenizer_path: Qwen/Qwen2.5-3B
-  cross_attn_every_n_layers: 1
-  audio_transformer_kwargs: {
-    n_head: 8,
-    n_layers: 3,
-    d_inner: 2048,
-    max_num_media: 128,  # must be >= max_num_window * num_fewshot_samples (4)
-    max_window_per_audio: 1,  # must = max_num_window
-    common_encoder_embed_dim: 1024
-  }

configs/inference_long.yaml DELETED Viewed

@@ -1,284 +0,0 @@
-train_config:
-  expdir: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed-sft
-  run_name: run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-3b-fixed-sft-long
-  delete_previous_checkpoint: true
-  batch_size: 2
-  gradient_accumulation_steps: 2
-  seed: 42
-  learning_rate: 0.00002
-  lr_scheduler: constant
-  loss_multiplier: 1.0
-  warmup_steps: 1875
-  weight_decay: 0.1
-  precision: amp_bf16  # ["amp_bf16", "amp_bfloat16", "bf16", "fp16", "fp32"]
-  gradient_checkpointing: False
-  num_epochs: 200  # num_epochs * dataset_blending_global_weight = 1
-  offline: false
-  freeze_lm_embeddings: false
-  logging_steps: 10
-  dist_backend: nccl
-  dist_url: env:// # tcp://localhost:7000
-  no_set_device_rank: false
-  fsdp: true
-  fsdp_use_orig_params: false  # Passed into the FSDP constructor. Enables param_groups and gradient masking for weight_decay. Does not work with OPT.
-  fsdp_sharding_strategy: full  # full, hybrid
-  horovod: false
-# instruction tuning hparams
-# sft_config:
-#   pretrained_path: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-3b-fixed_ckpt_stage1/
-#   pretrained_ckpt: checkpoint_199.pt
-#   unfreeze_full_lm: false
-data_config:
-  dataset_blending_global_weight: 0.005
-  dataset_blending_config:
-    MMAUQA/train:
-      weight: 1.5
-    AudioSet-Temporal-Speech-Audio-QA/train:
-      weight: 1.0
-    CompA-R-AQA/train:
-      weight: 1.0
-    # Audio QA
-    Clotho-AQA-AQA/train:
-      weight: 1.0
-    OpenAQA-AQA/train:
-      weight: 1.0
-    SalmonnQA/train:
-      weight: 1.0
-    AudioEntailmentQA/train:
-      weight: 1.0
-    # Audio Captioning
-    Clotho-v2-AudioCaptioning/train:
-      weight: 1.0
-    audiocaps-AudioCaptioning/train:
-      weight: 1.0
-    Epidemic_sound-AudioCaptioning/train:
-      weight: 1.0
-    MACS-AudioCaptioning/train:
-      weight: 1.0
-    # Audio Classification
-    FSD50k-EventClassification/train:
-      weight: 1.0
-    CochlScene-SceneClassification/train:
-      weight: 1.0
-    NonSpeech7k-EventClassification/train:
-      weight: 1.0
-    chime-home-EventClassification/train:
-      weight: 1.0
-    SONYC-UST-EventClassification/train:
-      weight: 1.0
-    # Speech Emotion Classification
-    MELD-EmotionClassification/train:
-      weight: 0.5
-    MELD-SentimentClassification/train:
-      weight: 0.5
-    emov-db-EmotionClassification/train:
-      weight: 1.0
-    jl-corpus-EmotionClassification/train:
-      weight: 6.0
-    tess-EmotionClassification/train:
-      weight: 2.5
-    IEMOCAP-EmotionClassification/train:
-      weight: 3.0
-    OMGEmotion-EmotionClassification/train:
-      weight: 3.0
-    VocalSound-VocalClassification/train:
-      weight: 1.5
-    # Music QA
-    Music-AVQA-AQA_All/train:
-      weight: 3.0
-    MU-LLAMA-AQA/train:
-      weight: 1.0
-    # Music Captioning
-    LP-MusicCaps-MSD-AudioCaptioning/train:
-      weight: 0.06
-    LP-MusicCaps-MC-AudioCaptioning/train:
-      weight: 2.0
-    LP-MusicCaps-MTT-AudioCaptioning/train:
-      weight: 1.0
-    MusicCaps-AudioCaptioning/train:
-      weight: 6.0
-    musdbhq-captioning/train:
-      weight: 2.0
-    # Music Understanding
-    NSynth-MIR/train:
-      weight: 0.2
-    mtg-jamendo-MusicTagging/train:
-      weight: 0.1
-    FMA-GenreClassification/train:
-      weight: 0.5
-    musdbhq-InstrClassification/train:
-      weight: 0.8
-    LLARK_FMA-mir/train:
-      weight: 1.0
-    LLARK_FMA-reasoning/train:
-      weight: 1.0
-    LLARK_MagnaTagATune-mir/train:
-      weight: 1.0
-    LLARK_MTG-Jamendo-reasoning/train:
-      weight: 1.0
-    LLARK_MagnaTagATune-reasoning/train:
-      weight: 1.0
-    LLARK_MTG-Jamendo-mir/train:
-      weight: 1.0
-    MusicBenchQA/train:
-      weight: 1.0
-  dataset_file_root: /lustre/fsw/portfolios/adlr/users/sreyang/final_qa/foundational_data
-  data_root: /lustre/fsw/portfolios/adlr/users/zkong/datasets
-  dataset_blending_output: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed/dataset_blending.json
-  max_tokens: 512
-  num_workers: 4
-  valid_dataset_config:
-    Clotho-AQA-AQA/test: true
-    Clotho-v2-AudioCaptioning/test: true
-    audiocaps-AudioCaptioning/test: true
-    FSD50k-EventClassification/test: true
-    CochlScene-SceneClassification/test: true
-    NonSpeech7k-EventClassification/test: true
-    SONYC-UST-EventClassification/test: true
-    MELD-EmotionClassification/test: true
-    MELD-SentimentClassification/test: true
-    emov-db-EmotionClassification/val: true
-    jl-corpus-EmotionClassification/val: true
-    tess-EmotionClassification/val: true
-    IEMOCAP-EmotionClassification/val: true
-    OMGEmotion-EmotionClassification/val: true
-    VocalSound-VocalClassification/test: true
-    Music-AVQA-AQA_All/test: true
-    MU-LLAMA-AQA/test: true
-    LP-MusicCaps-MSD-AudioCaptioning/test: true
-    LP-MusicCaps-MC-AudioCaptioning/test: true
-    LP-MusicCaps-MTT-AudioCaptioning/test: true
-    MusicCaps-AudioCaptioning/test: true
-    NSynth-MIR/test: true
-    mtg-jamendo-MusicTagging/val: true
-    musdbhq-InstrClassification/test: true
-    # # zero shot
-    # CREMA-D-EmotionClassification/train:
-    #   prefix_prob: 1.0
-    # ravdess-EmotionClassification/train:
-    #   prefix_prob: 1.0
-    # UrbanSound8K-EventClassification/train:
-    #   prefix_prob: 1.0
-    # ESC50-EventClassification/train:
-    #   prefix_prob: 1.0
-    # DCASE17Task4-SceneClassification/test:
-    #   prefix_prob: 1.0
-    # GTZAN-GenreClassification/train:
-    #   prefix_prob: 1.0
-    # Medley-solos-DB-InstrClassification/test:
-    #   prefix_prob: 1.0
-clap_config:
-  method: nvclap-large
-  audio_embed_dim: 2048
-  checkpoint: /lustre/fsw/portfolios/adlr/users/sreyang/datasets/clap_datasets/clap_ckpts_5/15/ck_sim/checkpoints/epoch_15.pt
-  window_length: 10.0  # seconds
-  window_overlap: 0.0  # seconds
-  max_num_window: 30  # 5 minutes
-  max_num_fewshot: 1  # number of fewshot samples (including the final one)
-  finetune: true
-whisper_config:
-  method: whisper-large-v3
-  path: openai/whisper-large-v3
-  audio_embed_dim: 1280
-  sampling_rate: 16000
-  window_length: 30.0  # seconds
-  window_overlap: 0.0  # seconds
-  max_num_window: 1  # 30 seconds
-  max_num_fewshot: 1  # number of fewshot samples (including the final one)
-mert_config:
-  method: mert-v1
-  path: m-a-p/MERT-v1-330M
-  audio_embed_dim: 1024
-  sampling_rate: 24000
-  window_length: 10.0  # seconds
-  window_overlap: 0.0  # seconds
-  max_num_window: 1  # 10 seconds
-  max_num_fewshot: 1  # number of fewshot samples (including the final one)
-model_config:
-  cache_dir: /lustre/fsw/portfolios/adlr/users/sreyang/.cache
-  lang_encoder_path: Qwen/Qwen2.5-3B
-  tokenizer_path: Qwen/Qwen2.5-3B
-  cross_attn_every_n_layers: 1
-  audio_transformer_kwargs: {
-    n_head: 8,
-    n_layers: 3,
-    d_inner: 2048,
-    max_num_media: 128,  # must be >= max_num_window * num_fewshot_samples (4)
-    max_window_per_audio: 1,  # must = max_num_window
-    common_encoder_embed_dim: 1024
-  }

configs/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_ICL4x16win-4node.yaml DELETED Viewed

@@ -1,255 +0,0 @@
-train_config:
-  expdir: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/v1.0_optimlmax1.3b_foundation
-  run_name: run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_ICL4x16win-4node
-  delete_previous_checkpoint: true
-  batch_size: 6
-  gradient_accumulation_steps: 2  # 4 nodes
-  seed: 42
-  learning_rate: 0.0001
-  lr_scheduler: constant
-  loss_multiplier: 1.0
-  warmup_steps: 1875
-  weight_decay: 0.1
-  precision: amp_bf16  # ["amp_bf16", "amp_bfloat16", "bf16", "fp16", "fp32"]
-  gradient_checkpointing: False
-  num_epochs: 100  # num_epochs * dataset_blending_global_weight = 1
-  offline: false
-  freeze_lm_embeddings: true
-  logging_steps: 10
-  dist_backend: nccl
-  dist_url: env:// # tcp://localhost:7000
-  no_set_device_rank: false
-  fsdp: true
-  fsdp_use_orig_params: false  # Passed into the FSDP constructor. Enables param_groups and gradient masking for weight_decay. Does not work with OPT.
-  fsdp_sharding_strategy: full  # full, hybrid
-  horovod: false
-data_config:
-  dataset_blending_global_weight: 0.01
-  dataset_blending_config:
-    # Audio QA
-    OpenAQA-AQA/train:
-      weight: 1.0
-      prefix_prob: 0.0
-      augmentations:
-        do_nothing: 1.0
-    # Audio Captioning
-    BBCSoundEffects-AudioDescription/train:
-      weight: 5.0
-      prefix_prob: 0.5
-      augmentations:
-        do_nothing: 1.0
-    CLAP_freesound-AudioCaptioning/train:
-      weight: 1.0
-      prefix_prob: 0.5
-      augmentations:
-        do_nothing: 1.0
-    SoundDescs-AudioDescription/train:
-      weight: 1.0
-      prefix_prob: 0.5
-      augmentations:
-        do_nothing: 1.0
-    WavCaps-AudioSet_SL-AudioCaptioning/train:
-      weight: 1.0
-      prefix_prob: 0.5
-      augmentations:
-        do_nothing: 1.0
-    WavCaps-BBC_Sound_Effects-AudioCaptioning/train:
-      weight: 2
-      prefix_prob: 0.5
-      augmentations:
-        do_nothing: 1.0
-    WavCaps-FreeSound-AudioCaptioning/train:
-      weight: 2
-      prefix_prob: 0.5
-      augmentations:
-        do_nothing: 1.0
-    WavCaps-SoundBible-AudioCaptioning/train:
-      weight: 5
-      prefix_prob: 0.5
-      augmentations:
-        do_nothing: 1.0
-    # Audio Classification
-    AudioSetFullwoAudioMusicCaps-EventClassification/train:
-      weight: 1.0
-      prefix_prob: 0.5
-      augmentations:
-        num_words: 0.8
-        do_nothing: 0.2
-    AudioSet-EventClassification/train:
-      weight: 5.0
-      prefix_prob: 0.5
-      augmentations:
-        num_words: 0.8
-        do_nothing: 0.2
-    Clotho-AQA-EventClassification/train:
-      weight: 5.0
-      prefix_prob: 0.5
-      augmentations:
-        num_words: 0.8
-        do_nothing: 0.2
-    WavText5K-Tagging/train:
-      weight: 3.0
-      prefix_prob: 0.5
-      augmentations:
-        num_words: 0.8
-        do_nothing: 0.2
-    # Speech Emotion Classification
-    MSP-PODCAST-Publish-1.9-EmotionClassification/train:
-      weight: 1.8
-      prefix_prob: 0.5
-      augmentations:
-        provide_all_labels: 0.9
-        do_nothing: 0.1
-    MSP-PODCAST-Publish-1.9-EmotionClassification/interleaved_knn-train:
-      weight: 1.2
-      prefix_prob: 0.5
-      augmentations:
-        provide_all_labels: 0.9
-        do_nothing: 0.1
-    MELD-EmotionClassification/train:
-      weight: 1.8
-      prefix_prob: 0.5
-      augmentations:
-        provide_all_labels: 0.9
-        do_nothing: 0.1
-    MELD-EmotionClassification/interleaved_knn-train:
-      weight: 1.2
-      prefix_prob: 0.5
-      augmentations:
-        provide_all_labels: 0.9
-        do_nothing: 0.1
-    MELD-SentimentClassification/train:
-      weight: 1.8
-      prefix_prob: 0.5
-      augmentations:
-        provide_all_labels: 0.9
-        do_nothing: 0.1
-    MELD-SentimentClassification/interleaved_knn-train:
-      weight: 1.2
-      prefix_prob: 0.5
-      augmentations:
-        provide_all_labels: 0.9
-        do_nothing: 0.1
-    # Music QA
-    Music-AVQA-AVQA_All/train:
-      weight: 3.0
-      prefix_prob: 0.5
-      augmentations:
-        AQA_binary_instruction: 1.0
-    MU-LLAMA-AQA/train:
-      weight: 1.8
-      prefix_prob: 0.5
-      augmentations:
-        do_nothing: 1.0
-    MU-LLAMA-AQA/interleaved_knn-train:
-      weight: 1.2
-      prefix_prob: 0.5
-      augmentations:
-        do_nothing: 1.0
-    # Music Captioning
-    LP-MusicCaps-MSD-AudioCaptioning/train:
-      weight: 1.0
-      prefix_prob: 0.5
-      augmentations:
-        do_nothing: 1.0
-    # Music Understanding
-    NSynth-MIR/train:
-      weight: 0.6
-      prefix_prob: 0.5
-      augmentations:
-        do_nothing: 1.0
-    NSynth-MIR/interleaved_knn-train:
-      weight: 0.4
-      prefix_prob: 0.5
-      augmentations:
-        do_nothing: 1.0
-    mtg-jamendo-MusicTagging/train:
-      weight: 1.0
-      prefix_prob: 0.5
-      augmentations:
-        do_nothing: 1.0
-  dataset_file_root: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/dataset_files
-  data_root: /lustre/fsw/portfolios/adlr/users/zkong/datasets
-  dataset_blending_output: dataset_blending.json
-  max_tokens: 512
-  num_workers: 4
-  valid_dataset_config:
-    CLAP_freesound-AudioCaptioning/test: true
-    SoundDescs-AudioDescription/test: true
-    Clotho-AQA-EventClassification/test: true
-    MSP-PODCAST-Publish-1.9-EmotionClassification/test: true
-    MSP-PODCAST-Publish-1.9-EmotionClassification/interleaved_knn-test: true
-    MELD-EmotionClassification/test: true
-    MELD-EmotionClassification/interleaved_knn-test: true
-    MELD-SentimentClassification/test: true
-    MELD-SentimentClassification/interleaved_knn-test: true
-    MU-LLAMA-AQA/test: true
-    LP-MusicCaps-MSD-AudioCaptioning/val: true
-    NSynth-MIR/test: true
-    NSynth-MIR/interleaved_knn-test: true
-    mtg-jamendo-MusicTagging/val: true
-clap_config:
-  # method: laion-clap
-  # audio_embed_dim: 512
-  # model_name: 630k-fusion-best
-  # checkpoint: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/laion-clap-pretrained/laion_clap/630k-fusion-best.pt
-  method: microsoft-clap
-  audio_embed_dim: 1024
-  config_root: /home/zkong/audio_flamingo/audio_flamingo_v1/v1.0_optimlmax1.3b_foundation/my_ms_clap/src/configs
-  # model_name: '2023'
-  # checkpoint: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/clap/CLAP_weights_2023.pth
-  model_name: 'clapcap'
-  checkpoint: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/clap/clapcap_weights_2023.pth
-  window_length: 7.0  # seconds
-  window_overlap: 5.25  # seconds
-  max_num_window: 16  # 35 seconds
-  max_num_fewshot: 4  # number of fewshot samples (including the final one)
-model_config:
-  cache_dir: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/LLM_pretrained/.cache
-  lang_encoder_path: facebook/opt-iml-max-1.3b
-  tokenizer_path: facebook/opt-iml-max-1.3b
-  cross_attn_every_n_layers: 1
-  audio_transformer_kwargs: {
-    n_head: 8,
-    n_layers: 3,
-    d_inner: 2048,
-    max_num_media: 128,  # must be >= max_num_window * num_fewshot_samples (4)
-    max_window_per_audio: 16,  # must = max_num_window
-  }

configs/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node.yaml DELETED Viewed

@@ -1,183 +0,0 @@
-train_config:
-  expdir: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed
-  run_name: run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-3b-fixed
-  delete_previous_checkpoint: true
-  batch_size: 4
-  gradient_accumulation_steps: 2  # 4 nodes
-  seed: 42
-  learning_rate: 0.0001
-  lr_scheduler: constant
-  loss_multiplier: 1.0
-  warmup_steps: 1875
-  weight_decay: 0.1
-  precision: amp_bf16  # ["amp_bf16", "amp_bfloat16", "bf16", "fp16", "fp32"]
-  gradient_checkpointing: False
-  num_epochs: 200  # num_epochs * dataset_blending_global_weight = 1
-  offline: false
-  freeze_lm_embeddings: false
-  logging_steps: 10
-  dist_backend: nccl
-  dist_url: env:// # tcp://localhost:7000
-  no_set_device_rank: false
-  fsdp: true
-  fsdp_use_orig_params: false  # Passed into the FSDP constructor. Enables param_groups and gradient masking for weight_decay. Does not work with OPT.
-  fsdp_sharding_strategy: full  # full, hybrid
-  horovod: false
-data_config:
-  dataset_blending_global_weight: 0.005
-  dataset_blending_config:
-    # Audio QA
-    OpenAQA-AQA/train:
-      weight: 1.0
-    AudioSet-Temporal-Speech-Audio-QA/train:
-      weight: 2.0
-    CompA-R-AQA/train:
-      weight: 2.0
-    # Audio Captioning
-    BBCSoundEffects-AudioDescription/train:
-      weight: 5.0
-    CLAP_freesound-AudioCaptioning/train:
-      weight: 1.0
-    SoundDescs-AudioDescription/train:
-      weight: 1.0
-    WavCaps-AudioSet_SL-AudioCaptioning/train:
-      weight: 1.0
-    WavCaps-BBC_Sound_Effects-AudioCaptioning/train:
-      weight: 2.0
-    WavCaps-FreeSound-AudioCaptioning/train:
-      weight: 2.0
-    WavCaps-SoundBible-AudioCaptioning/train:
-      weight: 5.0
-    Ego-10-AudioCaptioning/train:
-      weight: 2.0
-    Ego-30-AudioCaptioning/train:
-      weight: 2.0
-    # Audio Classification
-    AudioSetFullwoAudioMusicCaps-EventClassification/train:
-      weight: 1.0
-    AudioSet-EventClassification/train:
-      weight: 5.0
-    Clotho-AQA-EventClassification/train:
-      weight: 5.0
-    WavText5K-Tagging/train:
-      weight: 3.0
-    # Speech Emotion Classification
-    MSP-PODCAST-Publish-1.9-EmotionClassification/train:
-      weight: 3.0
-    MELD-EmotionClassification/train:
-      weight: 3.0
-    MELD-SentimentClassification/train:
-      weight: 3.0
-    # Music QA
-    Music-AVQA-AVQA_All/train:
-      weight: 3.0
-    MU-LLAMA-AQA/train:
-      weight: 3.0
-    # Music Captioning
-    LP-MusicCaps-MSD-AudioCaptioning/train:
-      weight: 1.0
-    # Music Understanding
-    NSynth-MIR/train:
-      weight: 1.0
-    mtg-jamendo-MusicTagging/train:
-      weight: 1.0
-  dataset_file_root: /lustre/fsw/portfolios/adlr/users/sreyang/final_qa/foundational_data
-  data_root: /lustre/fsw/portfolios/adlr/users/zkong/datasets
-  dataset_blending_output: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed/dataset_blending.json
-  max_tokens: 512
-  num_workers: 4
-  valid_dataset_config:
-    CLAP_freesound-AudioCaptioning/test: true
-    SoundDescs-AudioDescription/test: true
-    Clotho-AQA-EventClassification/test: true
-    MSP-PODCAST-Publish-1.9-EmotionClassification/test: true
-    MELD-EmotionClassification/test: true
-    MELD-SentimentClassification/test: true
-    MU-LLAMA-AQA/test: true
-    LP-MusicCaps-MSD-AudioCaptioning/val: true
-    NSynth-MIR/test: true
-    mtg-jamendo-MusicTagging/val: true
-clap_config:
-  method: nvclap-large
-  audio_embed_dim: 2048
-  checkpoint: /lustre/fsw/portfolios/adlr/users/sreyang/datasets/clap_datasets/clap_ckpts_5/15/ck_sim/checkpoints/epoch_15.pt
-  window_length: 10.0  # seconds
-  window_overlap: 0.0  # seconds
-  max_num_window: 3  # 30 seconds (3 windows x 10 s, no overlap)
-  max_num_fewshot: 1  # number of fewshot samples (including the final one)
-whisper_config:
-  method: whisper-large-v3
-  path: openai/whisper-large-v3
-  audio_embed_dim: 1280
-  sampling_rate: 16000
-  window_length: 30.0  # seconds
-  window_overlap: 0.0  # seconds
-  max_num_window: 1  # 30 seconds (1 window x 30 s)
-  max_num_fewshot: 1  # number of fewshot samples (including the final one)
-  finetune: true
-mert_config:
-  method: mert-v1
-  path: m-a-p/MERT-v1-330M
-  audio_embed_dim: 1024
-  sampling_rate: 24000
-  window_length: 10.0  # seconds
-  window_overlap: 0.0  # seconds
-  max_num_window: 1  # 10 seconds (1 window x 10 s)
-  max_num_fewshot: 1  # number of fewshot samples (including the final one)
-model_config:
-  cache_dir: /lustre/fsw/portfolios/adlr/users/sreyang/.cache
-  lang_encoder_path: Qwen/Qwen2.5-3B
-  tokenizer_path: Qwen/Qwen2.5-3B
-  cross_attn_every_n_layers: 1
-  audio_transformer_kwargs: {
-    n_head: 8,
-    n_layers: 3,
-    d_inner: 2048,
-    max_num_media: 128,  # must be >= max_num_window * num_fewshot_samples (4)
-    max_window_per_audio: 1,  # must = max_num_window
-    common_encoder_embed_dim: 1024
-  }

configs/run_demo_sft_fp32_xattnevery1_msclapcap_win7_ovlp5.25_ICL4x16win-4node.yaml DELETED Viewed

@@ -1,483 +0,0 @@
-train_config:
-  expdir: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/v1.0_optimlmax1.3b_foundation
-  run_name: run_demo_sft_fp32_xattnevery1_msclapcap_win7_ovlp5.25_ICL4x16win-4node
-  delete_previous_checkpoint: true
-  batch_size: 4
-  gradient_accumulation_steps: 1
-  seed: 42
-  learning_rate: 0.00002
-  lr_scheduler: constant
-  loss_multiplier: 1.0
-  warmup_steps: 1875
-  weight_decay: 0.1
-  precision: fp32  # ["amp_bf16", "amp_bfloat16", "bf16", "fp16", "fp32"]
-  gradient_checkpointing: False
-  num_epochs: 160  # num_epochs * dataset_blending_global_weight = 1
-  offline: false
-  freeze_lm_embeddings: false
-  logging_steps: 10
-  dist_backend: nccl
-  dist_url: env:// # tcp://localhost:7000
-  no_set_device_rank: false
-  fsdp: true
-  fsdp_use_orig_params: false  # Passed into the FSDP constructor. Enables param_groups and gradient masking for weight_decay. Does not work with OPT.
-  fsdp_sharding_strategy: full  # full, hybrid
-  horovod: false
-# instruction tuning hparams
-sft_config:
-  pretrained_path: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/v1.0_optimlmax1.3b_foundation/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_ICL4x16win-4node/
-  pretrained_ckpt: checkpoint_99.pt
-  unfreeze_full_lm: true
-data_config:
-  dataset_blending_global_weight: 0.01
-  dataset_blending_config:
-    # Audio QA
-    Clotho-AQA-AQA/train:
-      weight: 0.8
-      prefix_prob: 1.0
-      augmentations:
-        AQA_binary_instruction: 1.0
-    Clotho-AQA-AQA/interleaved_knn-train:
-      weight: 0.2
-      prefix_prob: 1.0
-      augmentations:
-        AQA_binary_instruction: 1.0
-    OpenAQA-AQA/train:
-      weight: 1.0
-      prefix_prob: 1.0
-      augmentations:
-        do_nothing: 1.0
-    # Audio Captioning
-    Clotho-v2-AudioCaptioning/train:
-      weight: 0.8
-      prefix_prob: 1.0
-      augmentations:
-        AC_short: 1.0
-    Clotho-v2-AudioCaptioning/interleaved_knn-train:
-      weight: 0.2
-      prefix_prob: 1.0
-      augmentations:
-        AC_short: 1.0
-    audiocaps-AudioCaptioning/train:
-      weight: 0.8
-      prefix_prob: 1.0
-      augmentations:
-        AC_short: 1.0
-    audiocaps-AudioCaptioning/interleaved_knn-train:
-      weight: 0.2
-      prefix_prob: 1.0
-      augmentations:
-        AC_short: 1.0
-    Epidemic_sound-AudioCaptioning/train:
-      weight: 0.8
-      prefix_prob: 1.0
-      augmentations:
-        AC_short: 1.0
-    Epidemic_sound-AudioCaptioning/interleaved_knn-train:
-      weight: 0.2
-      prefix_prob: 1.0
-      augmentations:
-        AC_short: 1.0
-    MACS-AudioCaptioning/train:
-      weight: 0.8
-      prefix_prob: 1.0
-      augmentations:
-        AC_short: 1.0
-    MACS-AudioCaptioning/interleaved_knn-train:
-      weight: 0.2
-      prefix_prob: 1.0
-      augmentations:
-        AC_short: 1.0
-    # Audio Classification
-    FSD50k-EventClassification/train:
-      weight: 0.8
-      prefix_prob: 1.0
-      augmentations:
-        default: 1.0
-    FSD50k-EventClassification/interleaved_knn-train:
-      weight: 0.2
-      prefix_prob: 1.0
-      augmentations:
-        default: 1.0
-    CochlScene-SceneClassification/train:
-      weight: 0.8
-      prefix_prob: 1.0
-      augmentations:
-        provide_all_labels: 0.5
-        default: 0.5
-    CochlScene-SceneClassification/interleaved_knn-train:
-      weight: 0.2
-      prefix_prob: 1.0
-      augmentations:
-        provide_all_labels: 0.5
-        default: 0.5
-    NonSpeech7k-EventClassification/train:
-      weight: 0.8
-      prefix_prob: 1.0
-      augmentations:
-        provide_all_labels: 0.5
-        default: 0.5
-    NonSpeech7k-EventClassification/interleaved_knn-train:
-      weight: 0.2
-      prefix_prob: 1.0
-      augmentations:
-        provide_all_labels: 0.5
-        default: 0.5
-    chime-home-EventClassification/train:
-      weight: 0.8
-      prefix_prob: 1.0
-      augmentations:
-        default: 0.5
-        num_words: 0.5
-    chime-home-EventClassification/interleaved_knn-train:
-      weight: 0.2
-      prefix_prob: 1.0
-      augmentations:
-        default: 0.5
-        num_words: 0.5
-    SONYC-UST-EventClassification/train:
-      weight: 0.8
-      prefix_prob: 1.0
-      augmentations:
-        default: 0.5
-        num_words: 0.5
-    SONYC-UST-EventClassification/interleaved_knn-train:
-      weight: 0.2
-      prefix_prob: 1.0
-      augmentations:
-        default: 0.5
-        num_words: 0.5
-    # Speech Emotion Classification
-    MELD-EmotionClassification/train:
-      weight: 0.5
-      prefix_prob: 1.0
-      augmentations:
-        provide_all_labels: 0.5
-        default: 0.5
-    MELD-SentimentClassification/train:
-      weight: 0.5
-      prefix_prob: 1.0
-      augmentations:
-        provide_all_labels: 0.1
-        default: 0.9
-    emov-db-EmotionClassification/train:
-      weight: 1.6
-      prefix_prob: 1.0
-      augmentations:
-        provide_all_labels: 0.5
-        default: 0.5
-    emov-db-EmotionClassification/interleaved_knn-train:
-      weight: 0.4
-      prefix_prob: 1.0
-      augmentations:
-        provide_all_labels: 0.5
-        default: 0.5
-    jl-corpus-EmotionClassification/train:
-      weight: 6.0
-      prefix_prob: 1.0
-      augmentations:
-        provide_all_labels: 0.5
-        default: 0.5
-    jl-corpus-EmotionClassification/interleaved_knn-train:
-      weight: 1.5
-      prefix_prob: 1.0
-      augmentations:
-        provide_all_labels: 0.5
-        default: 0.5
-    tess-EmotionClassification/train:
-      weight: 2.0
-      prefix_prob: 1.0
-      augmentations:
-        provide_all_labels: 0.5
-        default: 0.5
-    tess-EmotionClassification/interleaved_knn-train:
-      weight: 0.5
-      prefix_prob: 1.0
-      augmentations:
-        provide_all_labels: 0.5
-        default: 0.5
-    IEMOCAP-EmotionClassification/train:
-      weight: 2.4
-      prefix_prob: 1.0
-      augmentations:
-        provide_all_labels: 0.5
-        default: 0.5
-    IEMOCAP-EmotionClassification/interleaved_knn-train:
-      weight: 0.6
-      prefix_prob: 1.0
-      augmentations:
-        provide_all_labels: 0.5
-        default: 0.5
-    OMGEmotion-EmotionClassification/train:
-      weight: 3.0
-      prefix_prob: 1.0
-      augmentations:
-        provide_all_labels: 0.5
-        default: 0.5
-    VocalSound-VocalClassification/train:
-      weight: 1.0
-      prefix_prob: 1.0
-      augmentations:
-        provide_all_labels: 0.5
-        default: 0.5
-    # Music QA
-    Music-AVQA-AQA_All/train:
-      weight: 2.0
-      prefix_prob: 1.0
-      augmentations:
-        AQA_binary_instruction: 1.0
-    Music-AVQA-AQA_All/interleaved_knn-train:
-      weight: 1.0
-      prefix_prob: 1.0
-      augmentations:
-        AQA_binary_instruction: 1.0
-    MU-LLAMA-AQA/train:
-      weight: 0.9
-      prefix_prob: 1.0
-      augmentations:
-        do_nothing: 1.0
-    MU-LLAMA-AQA/interleaved_knn-train:
-      weight: 0.1
-      prefix_prob: 1.0
-      augmentations:
-        do_nothing: 1.0
-    # Music Captioning
-    LP-MusicCaps-MSD-AudioCaptioning/train:
-      weight: 0.05  # 1.3M
-      prefix_prob: 1.0
-      augmentations:
-        AC_paragraph: 1.0
-    LP-MusicCaps-MSD-AudioCaptioning/interleaved_knn-train:
-      weight: 0.05  # 111k
-      prefix_prob: 1.0
-      augmentations:
-        AC_paragraph: 1.0
-    LP-MusicCaps-MC-AudioCaptioning/train:
-      weight: 1.6
-      prefix_prob: 1.0
-      augmentations:
-        AC_paragraph: 1.0
-    LP-MusicCaps-MC-AudioCaptioning/interleaved_knn-train:
-      weight: 0.4
-      prefix_prob: 1.0
-      augmentations:
-        AC_paragraph: 1.0
-    LP-MusicCaps-MTT-AudioCaptioning/train:
-      weight: 0.8
-      prefix_prob: 1.0
-      augmentations:
-        AC_long: 1.0
-    LP-MusicCaps-MTT-AudioCaptioning/interleaved_knn-train:
-      weight: 0.2
-      prefix_prob: 1.0
-      augmentations:
-        AC_long: 1.0
-    MusicCaps-AudioCaptioning/train:
-      weight: 6.0
-      prefix_prob: 1.0
-      augmentations:
-        AC_paragraph: 1.0
-    MusicCaps-AudioCaptioning/interleaved_knn-train:
-      weight: 1.5
-      prefix_prob: 1.0
-      augmentations:
-        AC_paragraph: 1.0
-    SongDescriber-AudioCaptioning/train:
-      weight: 0.8
-      prefix_prob: 1.0
-      augmentations:
-        AC_long: 1.0
-    SongDescriber-AudioCaptioning/interleaved_knn-train:
-      weight: 0.2
-      prefix_prob: 1.0
-      augmentations:
-        AC_long: 1.0
-    # Music Understanding
-    NSynth-MIR/train:
-      weight: 0.2  # 289k for weight = 1
-      prefix_prob: 1.0
-      augmentations:
-        do_nothing: 1.0
-    NSynth-MIR/interleaved_knn-train:
-      weight: 0.2  # 60k for weight = 1
-      prefix_prob: 1.0
-      augmentations:
-        do_nothing: 1.0
-    mtg-jamendo-MusicTagging/train:
-      weight: 0.1
-      prefix_prob: 1.0
-      augmentations:
-        default: 1.0
-    FMA-GenreClassification/train:
-      weight: 0.4  # 104k for weight = 1
-      prefix_prob: 1.0
-      augmentations:
-        do_nothing: 1.0
-    FMA-GenreClassification/interleaved_knn-train:
-      weight: 0.3  # 46k for weight = 1
-      prefix_prob: 1.0
-      augmentations:
-        do_nothing: 1.0
-    musdbhq-InstrClassification/train:
-      weight: 0.8
-      prefix_prob: 1.0
-      augmentations:
-        provide_all_labels: 0.5
-        default: 0.5
-  dataset_file_root: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/dataset_files
-  data_root: /lustre/fsw/portfolios/adlr/users/zkong/datasets
-  dataset_blending_output: dataset_blending.json
-  max_tokens: 512
-  num_workers: 4
-  valid_dataset_config:
-    Clotho-AQA-AQA/test: true
-    Clotho-AQA-AQA/interleaved_knn-test: true
-    Clotho-v2-AudioCaptioning/test: true
-    Clotho-v2-AudioCaptioning/interleaved_knn-test: true
-    FSD50k-EventClassification/test: true
-    FSD50k-EventClassification/interleaved_knn-test: true
-    CochlScene-SceneClassification/test: true
-    CochlScene-SceneClassification/interleaved_knn-test: true
-    NonSpeech7k-EventClassification/test: true
-    NonSpeech7k-EventClassification/interleaved_knn-test: true
-    SONYC-UST-EventClassification/test: true
-    SONYC-UST-EventClassification/interleaved_knn-test: true
-    emov-db-EmotionClassification/val: true
-    emov-db-EmotionClassification/interleaved_knn-val: true
-    jl-corpus-EmotionClassification/val: true
-    jl-corpus-EmotionClassification/interleaved_knn-val: true
-    tess-EmotionClassification/val: true
-    tess-EmotionClassification/interleaved_knn-val: true
-    IEMOCAP-EmotionClassification/test: true
-    IEMOCAP-EmotionClassification/interleaved_knn-test: true
-    OMGEmotion-EmotionClassification/val: true
-    Music-AVQA-AQA_All/test: true
-    Music-AVQA-AQA_All/interleaved_knn-test: true
-    MU-LLAMA-AQA/test: true
-    LP-MusicCaps-MSD-AudioCaptioning/test: true
-    LP-MusicCaps-MC-AudioCaptioning/test: true
-    LP-MusicCaps-MTT-AudioCaptioning/test: true
-    LP-MusicCaps-MTT-AudioCaptioning/interleaved_knn-test: true
-    NSynth-MIR/test: true
-    NSynth-MIR/interleaved_knn-test: true
-    mtg-jamendo-MusicTagging/val: true
-    audiocaps-AudioCaptioning/test: true
-    audiocaps-AudioCaptioning/interleaved_knn-test: true
-    MusicCaps-AudioCaptioning/test: true
-    MELD-EmotionClassification/test: true
-    MELD-SentimentClassification/test: true
-    VocalSound-VocalClassification/test: true
-    musdbhq-InstrClassification/test: true
-    # zero shot
-    GTZAN-GenreClassification/train:
-      prefix_prob: 1.0
-      augmentations:
-        provide_all_labels: 1.0
-    GTZAN-GenreClassification/interleaved_knn-train:
-      prefix_prob: 1.0
-      augmentations:
-        provide_all_labels: 1.0
-    Medley-solos-DB-InstrClassification/test:
-      prefix_prob: 1.0
-      augmentations:
-        provide_all_labels: 1.0
-    Medley-solos-DB-InstrClassification/interleaved_knn-test:
-      prefix_prob: 1.0
-      augmentations:
-        provide_all_labels: 1.0
-clap_config:
-  # method: laion-clap
-  # audio_embed_dim: 512
-  # model_name: 630k-fusion-best
-  # checkpoint: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/laion-clap-pretrained/laion_clap/630k-fusion-best.pt
-  method: microsoft-clap
-  audio_embed_dim: 1024
-  config_root: /home/zkong/audio_flamingo/audio_flamingo_v1/v1.0_optimlmax1.3b_foundation/my_ms_clap/src/configs
-  # model_name: '2023'
-  # checkpoint: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/clap/CLAP_weights_2023.pth
-  model_name: 'clapcap'
-  checkpoint: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/clap/clapcap_weights_2023.pth
-  window_length: 7.0  # seconds
-  window_overlap: 5.25  # seconds
-  max_num_window: 16  # 35 seconds
-  max_num_fewshot: 4  # number of fewshot samples (including the final one)
-model_config:
-  cache_dir: /lustre/fsw/portfolios/adlr/users/zkong/audio-flamingo-data/LLM_pretrained/.cache
-  lang_encoder_path: facebook/opt-iml-max-1.3b
-  tokenizer_path: facebook/opt-iml-max-1.3b
-  cross_attn_every_n_layers: 1
-  audio_transformer_kwargs: {
-    n_head: 8,
-    n_layers: 3,
-    d_inner: 2048,
-    max_num_media: 128,  # must be >= max_num_window * num_fewshot_samples (4)
-    max_window_per_audio: 16,  # must = max_num_window
-  }

configs/run_demo_sft_fp32_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node.yaml DELETED Viewed

@@ -1,284 +0,0 @@
-train_config:
-  expdir: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed
-  run_name: run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-3b-fixed-sft
-  delete_previous_checkpoint: true
-  batch_size: 4
-  gradient_accumulation_steps: 2
-  seed: 42
-  learning_rate: 0.00002
-  lr_scheduler: constant
-  loss_multiplier: 1.0
-  warmup_steps: 1875
-  weight_decay: 0.1
-  precision: amp_bf16  # ["amp_bf16", "amp_bfloat16", "bf16", "fp16", "fp32"]
-  gradient_checkpointing: False
-  num_epochs: 200  # num_epochs * dataset_blending_global_weight = 1
-  offline: false
-  freeze_lm_embeddings: false
-  logging_steps: 10
-  dist_backend: nccl
-  dist_url: env:// # tcp://localhost:7000
-  no_set_device_rank: false
-  fsdp: true
-  fsdp_use_orig_params: false  # Passed into the FSDP constructor. Enables param_groups and gradient masking for weight_decay. Does not work with OPT.
-  fsdp_sharding_strategy: full  # full, hybrid
-  horovod: false
-# instruction tuning hparams
-sft_config:
-  pretrained_path: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed/run_demo_pretraining_bf16_xattnevery1_msclapcap_win7_ovlp5.25_single16win-4node-qwen3b-rotary-3b-fixed_ckpt_stage1/
-  pretrained_ckpt: checkpoint_199.pt
-  unfreeze_full_lm: false
-data_config:
-  dataset_blending_global_weight: 0.005
-  dataset_blending_config:
-    MMAUQA/train:
-      weight: 1.5
-    AudioSet-Temporal-Speech-Audio-QA/train:
-      weight: 1.0
-    CompA-R-AQA/train:
-      weight: 1.0
-    # Audio QA
-    Clotho-AQA-AQA/train:
-      weight: 1.0
-    OpenAQA-AQA/train:
-      weight: 1.0
-    SalmonnQA/train:
-      weight: 1.0
-    AudioEntailmentQA/train:
-      weight: 1.0
-    # Audio Captioning
-    Clotho-v2-AudioCaptioning/train:
-      weight: 1.0
-    audiocaps-AudioCaptioning/train:
-      weight: 1.0
-    Epidemic_sound-AudioCaptioning/train:
-      weight: 1.0
-    MACS-AudioCaptioning/train:
-      weight: 1.0
-    # Audio Classification
-    FSD50k-EventClassification/train:
-      weight: 1.0
-    CochlScene-SceneClassification/train:
-      weight: 1.0
-    NonSpeech7k-EventClassification/train:
-      weight: 1.0
-    chime-home-EventClassification/train:
-      weight: 1.0
-    SONYC-UST-EventClassification/train:
-      weight: 1.0
-    # Speech Emotion Classification
-    MELD-EmotionClassification/train:
-      weight: 0.5
-    MELD-SentimentClassification/train:
-      weight: 0.5
-    emov-db-EmotionClassification/train:
-      weight: 1.0
-    jl-corpus-EmotionClassification/train:
-      weight: 6.0
-    tess-EmotionClassification/train:
-      weight: 2.5
-    IEMOCAP-EmotionClassification/train:
-      weight: 3.0
-    OMGEmotion-EmotionClassification/train:
-      weight: 3.0
-    VocalSound-VocalClassification/train:
-      weight: 1.5
-    # Music QA
-    Music-AVQA-AQA_All/train:
-      weight: 3.0
-    MU-LLAMA-AQA/train:
-      weight: 1.0
-    # Music Captioning
-    LP-MusicCaps-MSD-AudioCaptioning/train:
-      weight: 0.06
-    LP-MusicCaps-MC-AudioCaptioning/train:
-      weight: 2.0
-    LP-MusicCaps-MTT-AudioCaptioning/train:
-      weight: 1.0
-    MusicCaps-AudioCaptioning/train:
-      weight: 6.0
-    musdbhq-captioning/train:
-      weight: 2.0
-    # Music Understanding
-    NSynth-MIR/train:
-      weight: 0.2
-    mtg-jamendo-MusicTagging/train:
-      weight: 0.1
-    FMA-GenreClassification/train:
-      weight: 0.5
-    musdbhq-InstrClassification/train:
-      weight: 0.8
-    LLARK_FMA-mir/train:
-      weight: 1.0
-    LLARK_FMA-reasoning/train:
-      weight: 1.0
-    LLARK_MagnaTagATune-mir/train:
-      weight: 1.0
-    LLARK_MTG-Jamendo-reasoning/train:
-      weight: 1.0
-    LLARK_MagnaTagATune-reasoning/train:
-      weight: 1.0
-    LLARK_MTG-Jamendo-mir/train:
-      weight: 1.0
-    MusicBenchQA/train:
-      weight: 1.0
-  dataset_file_root: /lustre/fsw/portfolios/adlr/users/sreyang/final_qa/foundational_data_w_duration
-  data_root: /lustre/fsw/portfolios/adlr/users/zkong/datasets
-  dataset_blending_output: /lustre/fsw/portfolios/adlr/users/sreyang/flamingo_v2/af2_exp_qwen3b_rotary_all_layers-7b-fixed/dataset_blending.json
-  max_tokens: 512
-  num_workers: 4
-  valid_dataset_config:
-    Clotho-AQA-AQA/test: true
-    Clotho-v2-AudioCaptioning/test: true
-    audiocaps-AudioCaptioning/test: true
-    FSD50k-EventClassification/test: true
-    CochlScene-SceneClassification/test: true
-    NonSpeech7k-EventClassification/test: true
-    SONYC-UST-EventClassification/test: true
-    MELD-EmotionClassification/test: true
-    MELD-SentimentClassification/test: true
-    emov-db-EmotionClassification/val: true
-    jl-corpus-EmotionClassification/val: true
-    tess-EmotionClassification/val: true
-    IEMOCAP-EmotionClassification/val: true
-    OMGEmotion-EmotionClassification/val: true
-    VocalSound-VocalClassification/test: true
-    Music-AVQA-AQA_All/test: true
-    MU-LLAMA-AQA/test: true
-    LP-MusicCaps-MSD-AudioCaptioning/test: true
-    LP-MusicCaps-MC-AudioCaptioning/test: true
-    LP-MusicCaps-MTT-AudioCaptioning/test: true
-    MusicCaps-AudioCaptioning/test: true
-    NSynth-MIR/test: true
-    mtg-jamendo-MusicTagging/val: true
-    musdbhq-InstrClassification/test: true
-    # # zero shot
-    # CREMA-D-EmotionClassification/train:
-    #   prefix_prob: 1.0
-    # ravdess-EmotionClassification/train:
-    #   prefix_prob: 1.0
-    # UrbanSound8K-EventClassification/train:
-    #   prefix_prob: 1.0
-    # ESC50-EventClassification/train:
-    #   prefix_prob: 1.0
-    # DCASE17Task4-SceneClassification/test:
-    #   prefix_prob: 1.0
-    # GTZAN-GenreClassification/train:
-    #   prefix_prob: 1.0
-    # Medley-solos-DB-InstrClassification/test:
-    #   prefix_prob: 1.0
-clap_config:
-  method: nvclap-large
-  audio_embed_dim: 2048
-  checkpoint: /lustre/fsw/portfolios/adlr/users/sreyang/datasets/clap_datasets/clap_ckpts_5/15/ck_sim/checkpoints/epoch_15.pt
-  window_length: 10.0  # seconds
-  window_overlap: 0.0  # seconds
-  max_num_window: 9  # 1.5 minutes
-  max_num_fewshot: 1  # number of fewshot samples (including the final one)
-  finetune: true
-whisper_config:
-  method: whisper-large-v3
-  path: openai/whisper-large-v3
-  audio_embed_dim: 1280
-  sampling_rate: 16000
-  window_length: 30.0  # seconds
-  window_overlap: 0.0  # seconds
-  max_num_window: 1  # 30 seconds (1 window x 30 s)
-  max_num_fewshot: 1  # number of fewshot samples (including the final one)
-mert_config:
-  method: mert-v1
-  path: m-a-p/MERT-v1-330M
-  audio_embed_dim: 1024
-  sampling_rate: 24000
-  window_length: 10.0  # seconds
-  window_overlap: 0.0  # seconds
-  max_num_window: 1  # 10 seconds (1 window x 10 s)
-  max_num_fewshot: 1  # number of fewshot samples (including the final one)
-model_config:
-  cache_dir: /lustre/fsw/portfolios/adlr/users/sreyang/.cache
-  lang_encoder_path: Qwen/Qwen2.5-3B
-  tokenizer_path: Qwen/Qwen2.5-3B
-  cross_attn_every_n_layers: 1
-  audio_transformer_kwargs: {
-    n_head: 8,
-    n_layers: 3,
-    d_inner: 2048,
-    max_num_media: 128,  # must be >= max_num_window * num_fewshot_samples (4)
-    max_window_per_audio: 1,  # must = max_num_window
-    common_encoder_embed_dim: 1024
-  }

data/__pycache__/data.cpython-38.pyc DELETED Viewed

Binary file (16.4 kB)

data/data.py CHANGED Viewed

@@ -92,7 +92,7 @@ class DataCollator:
                 audio_clip = torch.cat(this_audio_clip_clips)
                 audio_embed_mask = torch.zeros(max_window_batch)
                 audio_embed_mask[:num_windows] = 1
-            elif num_windows < max_window_batch:
                 audio_clip = this_audio_clip_clips[:max_window_batch]
                 audio_clip = torch.cat(this_audio_clip_clips)
                 audio_embed_mask = audio_embed_mask[:max_window_batch]

                 audio_clip = torch.cat(this_audio_clip_clips)
                 audio_embed_mask = torch.zeros(max_window_batch)
                 audio_embed_mask[:num_windows] = 1
+            elif num_windows > max_window_batch:
                 audio_clip = this_audio_clip_clips[:max_window_batch]
                 audio_clip = torch.cat(this_audio_clip_clips)
                 audio_embed_mask = audio_embed_mask[:max_window_batch]

my_laion_clap/CLAP/src/laion_clap/clap_module/model.py CHANGED Viewed

@@ -403,7 +403,7 @@ class CLAPAudioCfp:
     window_size: int = 1024
     hop_size: int = 1024
     fmin: int = 50
-    fmax: int = 14000
     class_num: int = 527
     mel_bins: int = 64
     clip_samples: int = 480000

     window_size: int = 1024
     hop_size: int = 1024
     fmin: int = 50
+    fmax: int = 8000
     class_num: int = 527
     mel_bins: int = 64
     clip_samples: int = 480000

my_ms_clap/src/configs/config_2022.yml CHANGED Viewed

@@ -10,7 +10,7 @@ out_emb: 2048
 sampling_rate: 44100
 duration: 5
 fmin: 50
-fmax: 14000
 n_fft: 1028
 hop_size: 320
 mel_bins: 64

 sampling_rate: 44100
 duration: 5
 fmin: 50
+fmax: 8000
 n_fft: 1028
 hop_size: 320
 mel_bins: 64

my_ms_clap/src/models/config.py CHANGED Viewed

@@ -56,7 +56,7 @@ window_size = 1024
 hop_size = 320 # 160 for scv2, 320 for audioset and esc-50
 mel_bins = 64
 fmin = 50
-fmax = 14000
 shift_max = int(clip_samples * 0.5)
 # for data collection

 hop_size = 320 # 160 for scv2, 320 for audioset and esc-50
 mel_bins = 64
 fmin = 50
+fmax = 8000
 shift_max = int(clip_samples * 0.5)
 # for data collection

src/__pycache__/__init__.cpython-38.pyc DELETED Viewed

Binary file (180 Bytes)

src/__pycache__/factory.cpython-38.pyc DELETED Viewed

Binary file (14.5 kB)

src/__pycache__/flamingo.cpython-38.pyc DELETED Viewed

Binary file (7.17 kB)

src/__pycache__/flamingo_lm.cpython-38.pyc DELETED Viewed

Binary file (5.79 kB)

src/__pycache__/helpers.cpython-38.pyc DELETED Viewed

Binary file (12.9 kB)

src/__pycache__/utils.cpython-38.pyc DELETED Viewed

Binary file (1.46 kB)

src/factory.py CHANGED Viewed

@@ -47,7 +47,7 @@ class CLAPAudioCfp:
     window_size: int = 1024
     hop_size: int = 160
     fmin: int = 50
-    fmax: int = 14000
     class_num: int = 527
     mel_bins: int = 64
     clip_samples: int = 160000
@@ -108,7 +108,7 @@ class CLAP(nn.Module):
             onesided=True,
             n_mels=64,
             f_min=50,
-            f_max=14000
         ).to(audio_data.device)
         mel = mel_tf(audio_data)

     window_size: int = 1024
     hop_size: int = 160
     fmin: int = 50
+    fmax: int = 8000
     class_num: int = 527
     mel_bins: int = 64
     clip_samples: int = 160000
             onesided=True,
             n_mels=64,
             f_min=50,
+            f_max=8000
         ).to(audio_data.device)
         mel = mel_tf(audio_data)

src/helpers.py CHANGED Viewed

@@ -85,13 +85,13 @@ class MultiHeadAttention(nn.Module):
         k = self.w_ks(k).view(sz_b, len_k, n_head, d_k)
         v = self.w_vs(v).view(sz_b, len_v, n_head, d_v)
         # Apply rotary positional embeddings
         q = apply_rotary_pos_emb(q, rotary_frequencies)
         k = apply_rotary_pos_emb(k, rotary_frequencies)
-        # Transpose for attention dot product: b x n x lq x dv
-        q, k, v = q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)
         if mask is not None:
             mask = mask.unsqueeze(1).unsqueeze(2)   # For head axis broadcasting.

         k = self.w_ks(k).view(sz_b, len_k, n_head, d_k)
         v = self.w_vs(v).view(sz_b, len_v, n_head, d_v)
+        # Transpose for attention dot product: b x n x lq x dv
+        q, k, v = q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)
         # Apply rotary positional embeddings
         q = apply_rotary_pos_emb(q, rotary_frequencies)
         k = apply_rotary_pos_emb(k, rotary_frequencies)
         if mask is not None:
             mask = mask.unsqueeze(1).unsqueeze(2)   # For head axis broadcasting.