Shikhar Bharadwaj committed on
Commit
67872bb
·
1 Parent(s): 282cf02

Update model

Browse files
README.md ADDED
@@ -0,0 +1,456 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - classification
6
+ datasets:
7
+ - fsd50k
8
+ license: cc-by-4.0
9
+ ---
10
+
11
+ ## ESPnet2 CLS model
12
+
13
+ ### `espnet/OpenBEATS-Base-i2-fsd50k`
14
+
15
+ This model was trained by Shikhar Bharadwaj using the fsd50k recipe in [espnet](https://github.com/espnet/espnet/).
16
+
17
+ ## CLS config
18
+
19
+ <details><summary>expand</summary>
20
+
21
+ ```
22
+ config: /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/earbasei2/conf/ear_base/fsd50k.yaml
23
+ print_config: false
24
+ log_level: INFO
25
+ drop_last_iter: false
26
+ dry_run: false
27
+ iterator_type: sequence
28
+ valid_iterator_type: null
29
+ output_dir: /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/fsd50k/cls_earbasei2
30
+ ngpu: 0
31
+ seed: 0
32
+ num_workers: 8
33
+ num_att_plot: 0
34
+ dist_backend: nccl
35
+ dist_init_method: env://
36
+ dist_world_size: null
37
+ dist_rank: null
38
+ local_rank: null
39
+ dist_master_addr: null
40
+ dist_master_port: null
41
+ dist_launcher: null
42
+ multiprocessing_distributed: false
43
+ unused_parameters: true
44
+ sharded_ddp: false
45
+ use_deepspeed: false
46
+ deepspeed_config: null
47
+ gradient_as_bucket_view: true
48
+ ddp_comm_hook: null
49
+ cudnn_enabled: true
50
+ cudnn_benchmark: false
51
+ cudnn_deterministic: true
52
+ use_tf32: false
53
+ collect_stats: false
54
+ write_collected_feats: false
55
+ max_epoch: 105
56
+ patience: null
57
+ val_scheduler_criterion:
58
+ - valid
59
+ - loss
60
+ early_stopping_criterion:
61
+ - valid
62
+ - loss
63
+ - min
64
+ best_model_criterion:
65
+ - - valid
66
+ - epoch_mAP
67
+ - max
68
+ keep_nbest_models: 5
69
+ nbest_averaging_interval: 0
70
+ grad_clip: 1
71
+ grad_clip_type: 2.0
72
+ grad_noise: false
73
+ accum_grad: 1
74
+ no_forward_run: false
75
+ resume: true
76
+ train_dtype: float32
77
+ use_amp: false
78
+ log_interval: null
79
+ use_matplotlib: true
80
+ use_tensorboard: true
81
+ create_graph_in_tensorboard: false
82
+ use_wandb: true
83
+ wandb_project: audioverse
84
+ wandb_id: null
85
+ wandb_entity: shikhar
86
+ wandb_name: fsd50k.earbasei2
87
+ wandb_model_log_interval: -1
88
+ detect_anomaly: false
89
+ use_adapter: false
90
+ adapter: lora
91
+ save_strategy: all
92
+ adapter_conf: {}
93
+ pretrain_path: null
94
+ init_param: []
95
+ ignore_init_mismatch: false
96
+ freeze_param: []
97
+ num_iters_per_epoch: null
98
+ batch_size: 20
99
+ valid_batch_size: null
100
+ batch_bins: 3000000
101
+ valid_batch_bins: null
102
+ category_sample_size: 10
103
+ train_shape_file:
104
+ - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/fsd50k/cls_stats_16k/train/speech_shape
105
+ - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/fsd50k/cls_stats_16k/train/label_shape
106
+ valid_shape_file:
107
+ - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/fsd50k/cls_stats_16k/valid/speech_shape
108
+ - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/fsd50k/cls_stats_16k/valid/label_shape
109
+ batch_type: numel
110
+ valid_batch_type: null
111
+ fold_length:
112
+ - 160000
113
+ - 200
114
+ sort_in_batch: descending
115
+ shuffle_within_batch: false
116
+ sort_batch: descending
117
+ multiple_iterator: false
118
+ utt2weight_file: null
119
+ chunk_length: 500
120
+ chunk_shift_ratio: 0.5
121
+ num_cache_chunks: 1024
122
+ chunk_excluded_key_prefixes: []
123
+ chunk_default_fs: null
124
+ chunk_max_abs_length: null
125
+ chunk_discard_short_samples: true
126
+ train_data_path_and_name_and_type:
127
+ - - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/dump/fsd50k/train/wav.scp
128
+ - speech
129
+ - sound
130
+ - - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/dump/fsd50k/train/text
131
+ - label
132
+ - text
133
+ valid_data_path_and_name_and_type:
134
+ - - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/dump/fsd50k/val/wav.scp
135
+ - speech
136
+ - sound
137
+ - - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/dump/fsd50k/val/text
138
+ - label
139
+ - text
140
+ multi_task_dataset: false
141
+ allow_variable_data_keys: false
142
+ max_cache_size: 0.0
143
+ max_cache_fd: 32
144
+ allow_multi_rates: false
145
+ valid_max_cache_size: null
146
+ exclude_weight_decay: false
147
+ exclude_weight_decay_conf: {}
148
+ optim: adamw
149
+ optim_conf:
150
+ lr: 3.0e-05
151
+ weight_decay: 0.01
152
+ betas:
153
+ - 0.9
154
+ - 0.98
155
+ scheduler: cosineannealingwarmuprestarts
156
+ scheduler_conf:
157
+ first_cycle_steps: 95000
158
+ warmup_steps: 8000
159
+ max_lr: 3.0e-05
160
+ min_lr: 5.0e-06
161
+ lightning_conf:
162
+ log_every_n_steps: 250
163
+ max_epochs: 105
164
+ strategy: ddp
165
+ strategy_conf:
166
+ find_unused_parameters: true
167
+ best_model_criterion:
168
+ - - valid/epoch_mAP
169
+ - max
170
+ - 1
171
+ devices: 1
172
+ num_nodes: 1
173
+ default_root_dir: /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/fsd50k/cls_earbasei2
174
+ token_list:
175
+ - Music
176
+ - Musical_instrument
177
+ - Domestic_sounds_and_home_sounds
178
+ - Human_voice
179
+ - Animal
180
+ - Percussion
181
+ - Wind_instrument_and_woodwind_instrument
182
+ - Vehicle
183
+ - Bowed_string_instrument
184
+ - Plucked_string_instrument
185
+ - Guitar
186
+ - Wild_animals
187
+ - Speech
188
+ - Keyboard_(musical)
189
+ - Water
190
+ - Motor_vehicle_(road)
191
+ - Alarm
192
+ - Bird
193
+ - Drum
194
+ - Liquid
195
+ - Explosion
196
+ - Mechanisms
197
+ - Domestic_animals_and_pets
198
+ - Door
199
+ - Laughter
200
+ - Brass_instrument
201
+ - Glass
202
+ - Engine
203
+ - Respiratory_sounds
204
+ - Tools
205
+ - Bell
206
+ - Piano
207
+ - Dog
208
+ - Human_group_actions
209
+ - Snare_drum
210
+ - Car
211
+ - Cymbal
212
+ - Rail_transport
213
+ - Trumpet
214
+ - Telephone
215
+ - Hands
216
+ - Singing
217
+ - Fart
218
+ - Acoustic_guitar
219
+ - Bird_vocalization_and_bird_call_and_bird_song
220
+ - Rain
221
+ - Livestock_and_farm_animals_and_working_animals
222
+ - Electric_guitar
223
+ - Breathing
224
+ - Thunderstorm
225
+ - Thunder
226
+ - Hi-hat
227
+ - Coin_(dropping)
228
+ - Fire
229
+ - Bark
230
+ - Shatter
231
+ - Female_speech_and_woman_speaking
232
+ - Fireworks
233
+ - Insect
234
+ - Male_speech_and_man_speaking
235
+ - Squeak
236
+ - Applause
237
+ - Clapping
238
+ - Walk_and_footsteps
239
+ - Splash_and_splatter
240
+ - Slam
241
+ - Gunshot_and_gunfire
242
+ - Drum_kit
243
+ - Train
244
+ - Dishes_and_pots_and_pans
245
+ - Bass_guitar
246
+ - Organ
247
+ - Wood
248
+ - Cat
249
+ - Subway_and_metro_and_underground
250
+ - Thump_and_thud
251
+ - Typing
252
+ - Camera
253
+ - Mallet_percussion
254
+ - Wind
255
+ - Zipper_(clothing)
256
+ - Sink_(filling_or_washing)
257
+ - Water_tap_and_faucet
258
+ - Cough
259
+ - Clock
260
+ - Crowd
261
+ - Tearing
262
+ - Whoosh_and_swoosh_and_swish
263
+ - Knock
264
+ - Cutlery_and_silverware
265
+ - Rattle_(instrument)
266
+ - Writing
267
+ - Screaming
268
+ - Chink_and_clink
269
+ - Ocean
270
+ - Run
271
+ - Tap
272
+ - Fowl
273
+ - Scratching_(performance_technique)
274
+ - Drip
275
+ - Bicycle
276
+ - Tambourine
277
+ - Burping_and_eructation
278
+ - Crackle
279
+ - Shout
280
+ - Bass_drum
281
+ - Bus
282
+ - Stream
283
+ - Crash_cymbal
284
+ - Cheering
285
+ - Toilet_flush
286
+ - Gong
287
+ - Crumpling_and_crinkling
288
+ - Aircraft
289
+ - Sliding_door
290
+ - Chirp_and_tweet
291
+ - Crushing
292
+ - Strum
293
+ - Chime
294
+ - Chewing_and_mastication
295
+ - Marimba_and_xylophone
296
+ - Harp
297
+ - Cricket
298
+ - Cowbell
299
+ - Meow
300
+ - Motorcycle
301
+ - Keys_jangling
302
+ - Whispering
303
+ - Power_tool
304
+ - Waves_and_surf
305
+ - Boom
306
+ - Drill
307
+ - Hammer
308
+ - Harmonica
309
+ - Accelerating_and_revving_and_vroom
310
+ - Hiss
311
+ - Child_speech_and_kid_speaking
312
+ - Rattle
313
+ - Drawer_open_or_close
314
+ - Bathtub_(filling_or_washing)
315
+ - Trickle_and_dribble
316
+ - Pour
317
+ - Microwave_oven
318
+ - Traffic_noise_and_roadway_noise
319
+ - Engine_starting
320
+ - Yell
321
+ - Chicken_and_rooster
322
+ - Female_singing
323
+ - Finger_snapping
324
+ - Computer_keyboard
325
+ - Car_passing_by
326
+ - Gurgling
327
+ - Raindrop
328
+ - Crack
329
+ - Sawing
330
+ - Vehicle_horn_and_car_horn_and_honking
331
+ - Truck
332
+ - Crying_and_sobbing
333
+ - Idling
334
+ - Doorbell
335
+ - Scissors
336
+ - Boat_and_Water_vehicle
337
+ - Fixed-wing_aircraft_and_airplane
338
+ - Giggle
339
+ - Printer
340
+ - Cupboard_open_or_close
341
+ - Tick-tock
342
+ - Ringtone
343
+ - Fill_(with_liquid)
344
+ - Skateboard
345
+ - Male_singing
346
+ - Screech
347
+ - Church_bell
348
+ - Buzz
349
+ - Siren
350
+ - Crow
351
+ - Sigh
352
+ - Race_car_and_auto_racing
353
+ - Growling
354
+ - Frog
355
+ - Gull_and_seagull
356
+ - Packing_tape_and_duct_tape
357
+ - Bicycle_bell
358
+ - Frying_(food)
359
+ - Chatter
360
+ - Boiling
361
+ - Wind_chime
362
+ - Sneeze
363
+ - Mechanical_fan
364
+ - Purr
365
+ - Speech_synthesizer
366
+ - Conversation
367
+ - Ratchet_and_pawl
368
+ - Gasp
369
+ - Chuckle_and_chortle
370
+ - Glockenspiel
371
+ - Accordion
372
+ - Tabla
373
+ - Typewriter
374
+ - Tick
375
+ - <blank>
376
+ - <unk>
377
+ text_token_list: null
378
+ text_bpemodel: null
379
+ init: xavier_normal
380
+ input_size: 1
381
+ use_preprocessor: true
382
+ frontend: null
383
+ frontend_conf: {}
384
+ specaug: null
385
+ specaug_conf: {}
386
+ normalize: null
387
+ normalize_conf: {}
388
+ preencoder: null
389
+ preencoder_conf: {}
390
+ encoder: beats
391
+ encoder_conf:
392
+ beats_ckpt_path: /work/nvme/bbjs/sbharadwaj/model_checkpoints/ear_base/beats_iter1_base.tune_lr5e-4_warmup40000_bins1600000_totalsteps400000/epoch59.pt
393
+ beats_config:
394
+ layer_wise_gradient_decay_ratio: 0.3
395
+ encoder_layerdrop: 0.1
396
+ dropout: 0.0
397
+ use_weighted_representation: false
398
+ specaug_config:
399
+ apply_time_warp: true
400
+ apply_freq_mask: false
401
+ apply_time_mask: true
402
+ time_mask_width_ratio_range:
403
+ - 0
404
+ - 0.06
405
+ num_time_mask: 1
406
+ roll_augment: true
407
+ roll_interval: 1
408
+ text_encoder: null
409
+ text_encoder_conf: {}
410
+ embedding_fusion: null
411
+ embedding_fusion_conf: {}
412
+ decoder: linear
413
+ decoder_conf: {}
414
+ model: espnet
415
+ model_conf:
416
+ classification_type: multi-label
417
+ mixup_probability: 0.2
418
+ lsm_weight: 0.0
419
+ log_epoch_metrics: true
420
+ user_callbacks:
421
+ - mAP_logging
422
+ required:
423
+ - output_dir
424
+ - token_list
425
+ task: cls
426
+ ```
427
+
428
+ </details>
429
+
430
+ ### Citations
431
+
432
+ ```BibTex
433
+
434
+ @article{bharadwaj2025openbeats,
435
+ title={OpenBEATs: A Fully Open-Source General-Purpose Audio Encoder},
436
+ author={Bharadwaj, Shikhar and Cornell, Samuele and Choi, Kwanghee and Fukayama, Satoru and Shim, Hye-jin and Deshmukh, Soham and Watanabe, Shinji},
437
+ journal={arXiv preprint arXiv:2507.14129},
438
+ year={2025}
439
+ }
440
+
441
+ @inproceedings{watanabe2018espnet,
442
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
443
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
444
+ year={2018},
445
+ booktitle={Proceedings of Interspeech},
446
+ pages={2207--2211},
447
+ doi={10.21437/Interspeech.2018-1456},
448
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
449
+ }
450
+
451
+
452
+
453
+
454
+
455
+
456
+ ```
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: '202503'
2
+ files:
3
+ classification_model_file: /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/fsd50k/cls_earbasei2/valid.epoch_mAP.ave_1best.pth
4
+ python: "3.9.18 | packaged by conda-forge | (main, Dec 23 2023, 17:20:25) \n[GCC 12.3.0]"
5
+ timestamp: 1763330803.045668
6
+ torch: 2.1.2
7
+ yaml_files:
8
+ classification_train_config: /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/fsd50k/cls_earbasei2/config.yaml
work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/data/fsd50k/token_list ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Music
2
+ Musical_instrument
3
+ Domestic_sounds_and_home_sounds
4
+ Human_voice
5
+ Animal
6
+ Percussion
7
+ Wind_instrument_and_woodwind_instrument
8
+ Vehicle
9
+ Bowed_string_instrument
10
+ Plucked_string_instrument
11
+ Guitar
12
+ Wild_animals
13
+ Speech
14
+ Keyboard_(musical)
15
+ Water
16
+ Motor_vehicle_(road)
17
+ Alarm
18
+ Bird
19
+ Drum
20
+ Liquid
21
+ Explosion
22
+ Mechanisms
23
+ Domestic_animals_and_pets
24
+ Door
25
+ Laughter
26
+ Brass_instrument
27
+ Glass
28
+ Engine
29
+ Respiratory_sounds
30
+ Tools
31
+ Bell
32
+ Piano
33
+ Dog
34
+ Human_group_actions
35
+ Snare_drum
36
+ Car
37
+ Cymbal
38
+ Rail_transport
39
+ Trumpet
40
+ Telephone
41
+ Hands
42
+ Singing
43
+ Fart
44
+ Acoustic_guitar
45
+ Bird_vocalization_and_bird_call_and_bird_song
46
+ Rain
47
+ Livestock_and_farm_animals_and_working_animals
48
+ Electric_guitar
49
+ Breathing
50
+ Thunderstorm
51
+ Thunder
52
+ Hi-hat
53
+ Coin_(dropping)
54
+ Fire
55
+ Bark
56
+ Shatter
57
+ Female_speech_and_woman_speaking
58
+ Fireworks
59
+ Insect
60
+ Male_speech_and_man_speaking
61
+ Squeak
62
+ Applause
63
+ Clapping
64
+ Walk_and_footsteps
65
+ Splash_and_splatter
66
+ Slam
67
+ Gunshot_and_gunfire
68
+ Drum_kit
69
+ Train
70
+ Dishes_and_pots_and_pans
71
+ Bass_guitar
72
+ Organ
73
+ Wood
74
+ Cat
75
+ Subway_and_metro_and_underground
76
+ Thump_and_thud
77
+ Typing
78
+ Camera
79
+ Mallet_percussion
80
+ Wind
81
+ Zipper_(clothing)
82
+ Sink_(filling_or_washing)
83
+ Water_tap_and_faucet
84
+ Cough
85
+ Clock
86
+ Crowd
87
+ Tearing
88
+ Whoosh_and_swoosh_and_swish
89
+ Knock
90
+ Cutlery_and_silverware
91
+ Rattle_(instrument)
92
+ Writing
93
+ Screaming
94
+ Chink_and_clink
95
+ Ocean
96
+ Run
97
+ Tap
98
+ Fowl
99
+ Scratching_(performance_technique)
100
+ Drip
101
+ Bicycle
102
+ Tambourine
103
+ Burping_and_eructation
104
+ Crackle
105
+ Shout
106
+ Bass_drum
107
+ Bus
108
+ Stream
109
+ Crash_cymbal
110
+ Cheering
111
+ Toilet_flush
112
+ Gong
113
+ Crumpling_and_crinkling
114
+ Aircraft
115
+ Sliding_door
116
+ Chirp_and_tweet
117
+ Crushing
118
+ Strum
119
+ Chime
120
+ Chewing_and_mastication
121
+ Marimba_and_xylophone
122
+ Harp
123
+ Cricket
124
+ Cowbell
125
+ Meow
126
+ Motorcycle
127
+ Keys_jangling
128
+ Whispering
129
+ Power_tool
130
+ Waves_and_surf
131
+ Boom
132
+ Drill
133
+ Hammer
134
+ Harmonica
135
+ Accelerating_and_revving_and_vroom
136
+ Hiss
137
+ Child_speech_and_kid_speaking
138
+ Rattle
139
+ Drawer_open_or_close
140
+ Bathtub_(filling_or_washing)
141
+ Trickle_and_dribble
142
+ Pour
143
+ Microwave_oven
144
+ Traffic_noise_and_roadway_noise
145
+ Engine_starting
146
+ Yell
147
+ Chicken_and_rooster
148
+ Female_singing
149
+ Finger_snapping
150
+ Computer_keyboard
151
+ Car_passing_by
152
+ Gurgling
153
+ Raindrop
154
+ Crack
155
+ Sawing
156
+ Vehicle_horn_and_car_horn_and_honking
157
+ Truck
158
+ Crying_and_sobbing
159
+ Idling
160
+ Doorbell
161
+ Scissors
162
+ Boat_and_Water_vehicle
163
+ Fixed-wing_aircraft_and_airplane
164
+ Giggle
165
+ Printer
166
+ Cupboard_open_or_close
167
+ Tick-tock
168
+ Ringtone
169
+ Fill_(with_liquid)
170
+ Skateboard
171
+ Male_singing
172
+ Screech
173
+ Church_bell
174
+ Buzz
175
+ Siren
176
+ Crow
177
+ Sigh
178
+ Race_car_and_auto_racing
179
+ Growling
180
+ Frog
181
+ Gull_and_seagull
182
+ Packing_tape_and_duct_tape
183
+ Bicycle_bell
184
+ Frying_(food)
185
+ Chatter
186
+ Boiling
187
+ Wind_chime
188
+ Sneeze
189
+ Mechanical_fan
190
+ Purr
191
+ Speech_synthesizer
192
+ Conversation
193
+ Ratchet_and_pawl
194
+ Gasp
195
+ Chuckle_and_chortle
196
+ Glockenspiel
197
+ Accordion
198
+ Tabla
199
+ Typewriter
200
+ Tick
201
+ <blank>
202
+ <unk>
work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/fsd50k/cls_earbasei2/RESULTS.md ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- Generated by scripts/utils/show_cls_result.sh -->
2
+ # RESULTS
3
+ ## Environments
4
+ - date: `Wed Mar 19 22:56:10 CDT 2025`
5
+ - python version: `3.9.18 | packaged by conda-forge | (main, Dec 23 2023, 17:20:25) [GCC 12.3.0]`
6
+ - espnet version: `espnet 202412`
7
+ - pytorch version: `pytorch 2.6.0.dev20241210+cu124`
8
+ - Git hash: `0e83599887130b7c351074669974ed8642016651`
9
+ - Commit date: `Wed Mar 19 20:32:11 2025 -0500`
10
+
11
+ ## cls_earbasei2
12
+ |Split|mean_acc|mAP|mean_auc|n_labels|n_instances|
13
+ |---|---|---|---|---|---|
14
+ |cls_test|43.43|55.84|94.82|200.00|10231.00|
15
+ |cls_val|45.26|58.59|96.81|200.00|4165.00|
16
+
work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/fsd50k/cls_earbasei2/config.yaml ADDED
@@ -0,0 +1,404 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/earbasei2/conf/ear_base/fsd50k.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ drop_last_iter: false
5
+ dry_run: false
6
+ iterator_type: sequence
7
+ valid_iterator_type: null
8
+ output_dir: /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/fsd50k/cls_earbasei2
9
+ ngpu: 0
10
+ seed: 0
11
+ num_workers: 8
12
+ num_att_plot: 0
13
+ dist_backend: nccl
14
+ dist_init_method: env://
15
+ dist_world_size: null
16
+ dist_rank: null
17
+ local_rank: null
18
+ dist_master_addr: null
19
+ dist_master_port: null
20
+ dist_launcher: null
21
+ multiprocessing_distributed: false
22
+ unused_parameters: true
23
+ sharded_ddp: false
24
+ use_deepspeed: false
25
+ deepspeed_config: null
26
+ gradient_as_bucket_view: true
27
+ ddp_comm_hook: null
28
+ cudnn_enabled: true
29
+ cudnn_benchmark: false
30
+ cudnn_deterministic: true
31
+ use_tf32: false
32
+ collect_stats: false
33
+ write_collected_feats: false
34
+ max_epoch: 105
35
+ patience: null
36
+ val_scheduler_criterion:
37
+ - valid
38
+ - loss
39
+ early_stopping_criterion:
40
+ - valid
41
+ - loss
42
+ - min
43
+ best_model_criterion:
44
+ - - valid
45
+ - epoch_mAP
46
+ - max
47
+ keep_nbest_models: 5
48
+ nbest_averaging_interval: 0
49
+ grad_clip: 1
50
+ grad_clip_type: 2.0
51
+ grad_noise: false
52
+ accum_grad: 1
53
+ no_forward_run: false
54
+ resume: true
55
+ train_dtype: float32
56
+ use_amp: false
57
+ log_interval: null
58
+ use_matplotlib: true
59
+ use_tensorboard: true
60
+ create_graph_in_tensorboard: false
61
+ use_wandb: true
62
+ wandb_project: audioverse
63
+ wandb_id: null
64
+ wandb_entity: shikhar
65
+ wandb_name: fsd50k.earbasei2
66
+ wandb_model_log_interval: -1
67
+ detect_anomaly: false
68
+ use_adapter: false
69
+ adapter: lora
70
+ save_strategy: all
71
+ adapter_conf: {}
72
+ pretrain_path: null
73
+ init_param: []
74
+ ignore_init_mismatch: false
75
+ freeze_param: []
76
+ num_iters_per_epoch: null
77
+ batch_size: 20
78
+ valid_batch_size: null
79
+ batch_bins: 3000000
80
+ valid_batch_bins: null
81
+ category_sample_size: 10
82
+ train_shape_file:
83
+ - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/fsd50k/cls_stats_16k/train/speech_shape
84
+ - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/fsd50k/cls_stats_16k/train/label_shape
85
+ valid_shape_file:
86
+ - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/fsd50k/cls_stats_16k/valid/speech_shape
87
+ - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/fsd50k/cls_stats_16k/valid/label_shape
88
+ batch_type: numel
89
+ valid_batch_type: null
90
+ fold_length:
91
+ - 160000
92
+ - 200
93
+ sort_in_batch: descending
94
+ shuffle_within_batch: false
95
+ sort_batch: descending
96
+ multiple_iterator: false
97
+ utt2weight_file: null
98
+ chunk_length: 500
99
+ chunk_shift_ratio: 0.5
100
+ num_cache_chunks: 1024
101
+ chunk_excluded_key_prefixes: []
102
+ chunk_default_fs: null
103
+ chunk_max_abs_length: null
104
+ chunk_discard_short_samples: true
105
+ train_data_path_and_name_and_type:
106
+ - - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/dump/fsd50k/train/wav.scp
107
+ - speech
108
+ - sound
109
+ - - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/dump/fsd50k/train/text
110
+ - label
111
+ - text
112
+ valid_data_path_and_name_and_type:
113
+ - - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/dump/fsd50k/val/wav.scp
114
+ - speech
115
+ - sound
116
+ - - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/dump/fsd50k/val/text
117
+ - label
118
+ - text
119
+ multi_task_dataset: false
120
+ allow_variable_data_keys: false
121
+ max_cache_size: 0.0
122
+ max_cache_fd: 32
123
+ allow_multi_rates: false
124
+ valid_max_cache_size: null
125
+ exclude_weight_decay: false
126
+ exclude_weight_decay_conf: {}
127
+ optim: adamw
128
+ optim_conf:
129
+ lr: 3.0e-05
130
+ weight_decay: 0.01
131
+ betas:
132
+ - 0.9
133
+ - 0.98
134
+ scheduler: cosineannealingwarmuprestarts
135
+ scheduler_conf:
136
+ first_cycle_steps: 95000
137
+ warmup_steps: 8000
138
+ max_lr: 3.0e-05
139
+ min_lr: 5.0e-06
140
+ lightning_conf:
141
+ log_every_n_steps: 250
142
+ max_epochs: 105
143
+ strategy: ddp
144
+ strategy_conf:
145
+ find_unused_parameters: true
146
+ best_model_criterion:
147
+ - - valid/epoch_mAP
148
+ - max
149
+ - 1
150
+ devices: 1
151
+ num_nodes: 1
152
+ default_root_dir: /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/fsd50k/cls_earbasei2
153
+ token_list:
154
+ - Music
155
+ - Musical_instrument
156
+ - Domestic_sounds_and_home_sounds
157
+ - Human_voice
158
+ - Animal
159
+ - Percussion
160
+ - Wind_instrument_and_woodwind_instrument
161
+ - Vehicle
162
+ - Bowed_string_instrument
163
+ - Plucked_string_instrument
164
+ - Guitar
165
+ - Wild_animals
166
+ - Speech
167
+ - Keyboard_(musical)
168
+ - Water
169
+ - Motor_vehicle_(road)
170
+ - Alarm
171
+ - Bird
172
+ - Drum
173
+ - Liquid
174
+ - Explosion
175
+ - Mechanisms
176
+ - Domestic_animals_and_pets
177
+ - Door
178
+ - Laughter
179
+ - Brass_instrument
180
+ - Glass
181
+ - Engine
182
+ - Respiratory_sounds
183
+ - Tools
184
+ - Bell
185
+ - Piano
186
+ - Dog
187
+ - Human_group_actions
188
+ - Snare_drum
189
+ - Car
190
+ - Cymbal
191
+ - Rail_transport
192
+ - Trumpet
193
+ - Telephone
194
+ - Hands
195
+ - Singing
196
+ - Fart
197
+ - Acoustic_guitar
198
+ - Bird_vocalization_and_bird_call_and_bird_song
199
+ - Rain
200
+ - Livestock_and_farm_animals_and_working_animals
201
+ - Electric_guitar
202
+ - Breathing
203
+ - Thunderstorm
204
+ - Thunder
205
+ - Hi-hat
206
+ - Coin_(dropping)
207
+ - Fire
208
+ - Bark
209
+ - Shatter
210
+ - Female_speech_and_woman_speaking
211
+ - Fireworks
212
+ - Insect
213
+ - Male_speech_and_man_speaking
214
+ - Squeak
215
+ - Applause
216
+ - Clapping
217
+ - Walk_and_footsteps
218
+ - Splash_and_splatter
219
+ - Slam
220
+ - Gunshot_and_gunfire
221
+ - Drum_kit
222
+ - Train
223
+ - Dishes_and_pots_and_pans
224
+ - Bass_guitar
225
+ - Organ
226
+ - Wood
227
+ - Cat
228
+ - Subway_and_metro_and_underground
229
+ - Thump_and_thud
230
+ - Typing
231
+ - Camera
232
+ - Mallet_percussion
233
+ - Wind
234
+ - Zipper_(clothing)
235
+ - Sink_(filling_or_washing)
236
+ - Water_tap_and_faucet
237
+ - Cough
238
+ - Clock
239
+ - Crowd
240
+ - Tearing
241
+ - Whoosh_and_swoosh_and_swish
242
+ - Knock
243
+ - Cutlery_and_silverware
244
+ - Rattle_(instrument)
245
+ - Writing
246
+ - Screaming
247
+ - Chink_and_clink
248
+ - Ocean
249
+ - Run
250
+ - Tap
251
+ - Fowl
252
+ - Scratching_(performance_technique)
253
+ - Drip
254
+ - Bicycle
255
+ - Tambourine
256
+ - Burping_and_eructation
257
+ - Crackle
258
+ - Shout
259
+ - Bass_drum
260
+ - Bus
261
+ - Stream
262
+ - Crash_cymbal
263
+ - Cheering
264
+ - Toilet_flush
265
+ - Gong
266
+ - Crumpling_and_crinkling
267
+ - Aircraft
268
+ - Sliding_door
269
+ - Chirp_and_tweet
270
+ - Crushing
271
+ - Strum
272
+ - Chime
273
+ - Chewing_and_mastication
274
+ - Marimba_and_xylophone
275
+ - Harp
276
+ - Cricket
277
+ - Cowbell
278
+ - Meow
279
+ - Motorcycle
280
+ - Keys_jangling
281
+ - Whispering
282
+ - Power_tool
283
+ - Waves_and_surf
284
+ - Boom
285
+ - Drill
286
+ - Hammer
287
+ - Harmonica
288
+ - Accelerating_and_revving_and_vroom
289
+ - Hiss
290
+ - Child_speech_and_kid_speaking
291
+ - Rattle
292
+ - Drawer_open_or_close
293
+ - Bathtub_(filling_or_washing)
294
+ - Trickle_and_dribble
295
+ - Pour
296
+ - Microwave_oven
297
+ - Traffic_noise_and_roadway_noise
298
+ - Engine_starting
299
+ - Yell
300
+ - Chicken_and_rooster
301
+ - Female_singing
302
+ - Finger_snapping
303
+ - Computer_keyboard
304
+ - Car_passing_by
305
+ - Gurgling
306
+ - Raindrop
307
+ - Crack
308
+ - Sawing
309
+ - Vehicle_horn_and_car_horn_and_honking
310
+ - Truck
311
+ - Crying_and_sobbing
312
+ - Idling
313
+ - Doorbell
314
+ - Scissors
315
+ - Boat_and_Water_vehicle
316
+ - Fixed-wing_aircraft_and_airplane
317
+ - Giggle
318
+ - Printer
319
+ - Cupboard_open_or_close
320
+ - Tick-tock
321
+ - Ringtone
322
+ - Fill_(with_liquid)
323
+ - Skateboard
324
+ - Male_singing
325
+ - Screech
326
+ - Church_bell
327
+ - Buzz
328
+ - Siren
329
+ - Crow
330
+ - Sigh
331
+ - Race_car_and_auto_racing
332
+ - Growling
333
+ - Frog
334
+ - Gull_and_seagull
335
+ - Packing_tape_and_duct_tape
336
+ - Bicycle_bell
337
+ - Frying_(food)
338
+ - Chatter
339
+ - Boiling
340
+ - Wind_chime
341
+ - Sneeze
342
+ - Mechanical_fan
343
+ - Purr
344
+ - Speech_synthesizer
345
+ - Conversation
346
+ - Ratchet_and_pawl
347
+ - Gasp
348
+ - Chuckle_and_chortle
349
+ - Glockenspiel
350
+ - Accordion
351
+ - Tabla
352
+ - Typewriter
353
+ - Tick
354
+ - <blank>
355
+ - <unk>
356
+ text_token_list: null
357
+ text_bpemodel: null
358
+ init: xavier_normal
359
+ input_size: 1
360
+ use_preprocessor: true
361
+ frontend: null
362
+ frontend_conf: {}
363
+ specaug: null
364
+ specaug_conf: {}
365
+ normalize: null
366
+ normalize_conf: {}
367
+ preencoder: null
368
+ preencoder_conf: {}
369
+ encoder: beats
370
+ encoder_conf:
371
+ beats_ckpt_path: /work/nvme/bbjs/sbharadwaj/model_checkpoints/ear_base/beats_iter1_base.tune_lr5e-4_warmup40000_bins1600000_totalsteps400000/epoch59.pt
372
+ beats_config:
373
+ layer_wise_gradient_decay_ratio: 0.3
374
+ encoder_layerdrop: 0.1
375
+ dropout: 0.0
376
+ use_weighted_representation: false
377
+ specaug_config:
378
+ apply_time_warp: true
379
+ apply_freq_mask: false
380
+ apply_time_mask: true
381
+ time_mask_width_ratio_range:
382
+ - 0
383
+ - 0.06
384
+ num_time_mask: 1
385
+ roll_augment: true
386
+ roll_interval: 1
387
+ text_encoder: null
388
+ text_encoder_conf: {}
389
+ embedding_fusion: null
390
+ embedding_fusion_conf: {}
391
+ decoder: linear
392
+ decoder_conf: {}
393
+ model: espnet
394
+ model_conf:
395
+ classification_type: multi-label
396
+ mixup_probability: 0.2
397
+ lsm_weight: 0.0
398
+ log_epoch_metrics: true
399
+ user_callbacks:
400
+ - mAP_logging
401
+ required:
402
+ - output_dir
403
+ - token_list
404
+ task: cls
work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/fsd50k/cls_earbasei2/lightning_logs/version_0/events.out.tfevents.1742423411.gh015.hsn.cm.delta.internal.ncsa.edu.1268224.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f8960e055b1598586dd9c02d57f0c17c79a7c412d546647c6bd8278f7334e7f
3
+ size 157218
work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/fsd50k/cls_earbasei2/lightning_logs/version_0/hparams.yaml ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ args: !!python/object:argparse.Namespace
2
+ accum_grad: 1
3
+ adapter: lora
4
+ adapter_conf: {}
5
+ allow_multi_rates: false
6
+ allow_variable_data_keys: false
7
+ batch_bins: 3000000
8
+ batch_size: 20
9
+ batch_type: numel
10
+ best_model_criterion:
11
+ - - valid
12
+ - epoch_mAP
13
+ - max
14
+ category_sample_size: 10
15
+ chunk_default_fs: null
16
+ chunk_discard_short_samples: true
17
+ chunk_excluded_key_prefixes: []
18
+ chunk_length: 500
19
+ chunk_max_abs_length: null
20
+ chunk_shift_ratio: 0.5
21
+ collect_stats: false
22
+ config: /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/earbasei2/conf/ear_base/fsd50k.yaml
23
+ create_graph_in_tensorboard: false
24
+ cudnn_benchmark: false
25
+ cudnn_deterministic: true
26
+ cudnn_enabled: true
27
+ ddp_comm_hook: null
28
+ decoder: linear
29
+ decoder_conf: {}
30
+ deepspeed_config: null
31
+ detect_anomaly: false
32
+ dist_backend: nccl
33
+ dist_init_method: env://
34
+ dist_launcher: null
35
+ dist_master_addr: null
36
+ dist_master_port: null
37
+ dist_rank: null
38
+ dist_world_size: null
39
+ drop_last_iter: false
40
+ dry_run: false
41
+ early_stopping_criterion: !!python/tuple
42
+ - valid
43
+ - loss
44
+ - min
45
+ embedding_fusion: null
46
+ embedding_fusion_conf: {}
47
+ encoder: beats
48
+ encoder_conf:
49
+ beats_ckpt_path: /work/nvme/bbjs/sbharadwaj/model_checkpoints/ear_base/beats_iter1_base.tune_lr5e-4_warmup40000_bins1600000_totalsteps400000/epoch59.pt
50
+ beats_config:
51
+ dropout: 0.0
52
+ encoder_layerdrop: 0.1
53
+ layer_wise_gradient_decay_ratio: 0.3
54
+ roll_augment: true
55
+ roll_interval: 1
56
+ specaug_config:
57
+ apply_freq_mask: false
58
+ apply_time_mask: true
59
+ apply_time_warp: true
60
+ num_time_mask: 1
61
+ time_mask_width_ratio_range:
62
+ - 0
63
+ - 0.06
64
+ use_weighted_representation: false
65
+ exclude_weight_decay: false
66
+ exclude_weight_decay_conf: {}
67
+ fold_length:
68
+ - 160000
69
+ - 200
70
+ freeze_param: []
71
+ frontend: null
72
+ frontend_conf:
73
+ fs: 16k
74
+ grad_clip: 1
75
+ grad_clip_type: 2.0
76
+ grad_noise: false
77
+ gradient_as_bucket_view: true
78
+ ignore_init_mismatch: false
79
+ init: xavier_normal
80
+ init_param: []
81
+ input_size: 1
82
+ iterator_type: sequence
83
+ keep_nbest_models: 5
84
+ lightning_conf:
85
+ best_model_criterion:
86
+ - - valid/epoch_mAP
87
+ - max
88
+ - 1
89
+ default_root_dir: /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/fsd50k/cls_earbasei2
90
+ devices: 1
91
+ log_every_n_steps: 250
92
+ num_nodes: 1
93
+ strategy: ddp
94
+ strategy_conf:
95
+ find_unused_parameters: true
96
+ local_rank: null
97
+ log_interval: null
98
+ log_level: INFO
99
+ max_cache_fd: 32
100
+ max_cache_size: 0.0
101
+ max_epoch: 60
102
+ model: espnet
103
+ model_conf:
104
+ classification_type: multi-label
105
+ log_epoch_metrics: true
106
+ lsm_weight: 0.0
107
+ mixup_probability: 0.2
108
+ multi_task_dataset: false
109
+ multiple_iterator: false
110
+ multiprocessing_distributed: false
111
+ nbest_averaging_interval: 0
112
+ ngpu: 0
113
+ no_forward_run: false
114
+ normalize: null
115
+ normalize_conf: {}
116
+ num_att_plot: 0
117
+ num_cache_chunks: 1024
118
+ num_iters_per_epoch: null
119
+ num_workers: 8
120
+ optim: adamw
121
+ optim_conf:
122
+ betas:
123
+ - 0.9
124
+ - 0.98
125
+ lr: 3.0e-05
126
+ weight_decay: 0.01
127
+ output_dir: /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/fsd50k/cls_earbasei2
128
+ patience: null
129
+ preencoder: null
130
+ preencoder_conf: {}
131
+ pretrain_path: null
132
+ print_config: false
133
+ required:
134
+ - output_dir
135
+ - token_list
136
+ resume: true
137
+ save_strategy: all
138
+ scheduler: cosineannealingwarmuprestarts
139
+ scheduler_conf:
140
+ first_cycle_steps: 95000
141
+ max_lr: 3.0e-05
142
+ min_lr: 5.0e-06
143
+ warmup_steps: 8000
144
+ seed: 0
145
+ sharded_ddp: false
146
+ shuffle_within_batch: false
147
+ sort_batch: descending
148
+ sort_in_batch: descending
149
+ specaug: null
150
+ specaug_conf: {}
151
+ task: cls
152
+ text_bpemodel: null
153
+ text_encoder: null
154
+ text_encoder_conf: {}
155
+ text_token_list: null
156
+ token_list: /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/data/fsd50k/token_list
157
+ train_data_path_and_name_and_type:
158
+ - !!python/tuple
159
+ - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/dump/fsd50k/train/wav.scp
160
+ - speech
161
+ - sound
162
+ - !!python/tuple
163
+ - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/dump/fsd50k/train/text
164
+ - label
165
+ - text
166
+ train_dtype: float32
167
+ train_shape_file:
168
+ - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/fsd50k/cls_stats_16k/train/speech_shape
169
+ - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/fsd50k/cls_stats_16k/train/label_shape
170
+ unused_parameters: true
171
+ use_adapter: false
172
+ use_amp: false
173
+ use_deepspeed: false
174
+ use_matplotlib: true
175
+ use_preprocessor: true
176
+ use_tensorboard: true
177
+ use_tf32: false
178
+ use_wandb: true
179
+ user_callbacks:
180
+ - mAP_logging
181
+ utt2weight_file: null
182
+ val_scheduler_criterion: !!python/tuple
183
+ - valid
184
+ - loss
185
+ valid_batch_bins: null
186
+ valid_batch_size: null
187
+ valid_batch_type: null
188
+ valid_data_path_and_name_and_type:
189
+ - !!python/tuple
190
+ - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/dump/fsd50k/val/wav.scp
191
+ - speech
192
+ - sound
193
+ - !!python/tuple
194
+ - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/dump/fsd50k/val/text
195
+ - label
196
+ - text
197
+ valid_iterator_type: null
198
+ valid_max_cache_size: null
199
+ valid_shape_file:
200
+ - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/fsd50k/cls_stats_16k/valid/speech_shape
201
+ - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/fsd50k/cls_stats_16k/valid/label_shape
202
+ wandb_entity: shikhar
203
+ wandb_id: null
204
+ wandb_model_log_interval: -1
205
+ wandb_name: fsd50k.earbasei2
206
+ wandb_project: audioverse
207
+ write_collected_feats: false
work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/fsd50k/cls_earbasei2/lightning_logs/version_1/events.out.tfevents.1742442145.gh096.hsn.cm.delta.internal.ncsa.edu.1929255.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71dfd792f5cea27e78bdf8a04ba6d04d9a350855a900e27cb886a0a23f0d2cc4
3
+ size 17146
work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/fsd50k/cls_earbasei2/lightning_logs/version_1/hparams.yaml ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ args: !!python/object:argparse.Namespace
2
+ accum_grad: 1
3
+ adapter: lora
4
+ adapter_conf: {}
5
+ allow_multi_rates: false
6
+ allow_variable_data_keys: false
7
+ batch_bins: 3000000
8
+ batch_size: 20
9
+ batch_type: numel
10
+ best_model_criterion:
11
+ - - valid
12
+ - epoch_mAP
13
+ - max
14
+ category_sample_size: 10
15
+ chunk_default_fs: null
16
+ chunk_discard_short_samples: true
17
+ chunk_excluded_key_prefixes: []
18
+ chunk_length: 500
19
+ chunk_max_abs_length: null
20
+ chunk_shift_ratio: 0.5
21
+ collect_stats: false
22
+ config: /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/earbasei2/conf/ear_base/fsd50k.yaml
23
+ create_graph_in_tensorboard: false
24
+ cudnn_benchmark: false
25
+ cudnn_deterministic: true
26
+ cudnn_enabled: true
27
+ ddp_comm_hook: null
28
+ decoder: linear
29
+ decoder_conf: {}
30
+ deepspeed_config: null
31
+ detect_anomaly: false
32
+ dist_backend: nccl
33
+ dist_init_method: env://
34
+ dist_launcher: null
35
+ dist_master_addr: null
36
+ dist_master_port: null
37
+ dist_rank: null
38
+ dist_world_size: null
39
+ drop_last_iter: false
40
+ dry_run: false
41
+ early_stopping_criterion: !!python/tuple
42
+ - valid
43
+ - loss
44
+ - min
45
+ embedding_fusion: null
46
+ embedding_fusion_conf: {}
47
+ encoder: beats
48
+ encoder_conf:
49
+ beats_ckpt_path: /work/nvme/bbjs/sbharadwaj/model_checkpoints/ear_base/beats_iter1_base.tune_lr5e-4_warmup40000_bins1600000_totalsteps400000/epoch59.pt
50
+ beats_config:
51
+ dropout: 0.0
52
+ encoder_layerdrop: 0.1
53
+ layer_wise_gradient_decay_ratio: 0.3
54
+ roll_augment: true
55
+ roll_interval: 1
56
+ specaug_config:
57
+ apply_freq_mask: false
58
+ apply_time_mask: true
59
+ apply_time_warp: true
60
+ num_time_mask: 1
61
+ time_mask_width_ratio_range:
62
+ - 0
63
+ - 0.06
64
+ use_weighted_representation: false
65
+ exclude_weight_decay: false
66
+ exclude_weight_decay_conf: {}
67
+ fold_length:
68
+ - 160000
69
+ - 200
70
+ freeze_param: []
71
+ frontend: null
72
+ frontend_conf:
73
+ fs: 16k
74
+ grad_clip: 1
75
+ grad_clip_type: 2.0
76
+ grad_noise: false
77
+ gradient_as_bucket_view: true
78
+ ignore_init_mismatch: false
79
+ init: xavier_normal
80
+ init_param: []
81
+ input_size: 1
82
+ iterator_type: sequence
83
+ keep_nbest_models: 5
84
+ lightning_conf:
85
+ best_model_criterion:
86
+ - - valid/epoch_mAP
87
+ - max
88
+ - 1
89
+ default_root_dir: /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/fsd50k/cls_earbasei2
90
+ devices: 1
91
+ log_every_n_steps: 250
92
+ max_epochs: 105
93
+ num_nodes: 1
94
+ strategy: ddp
95
+ strategy_conf:
96
+ find_unused_parameters: true
97
+ local_rank: null
98
+ log_interval: null
99
+ log_level: INFO
100
+ max_cache_fd: 32
101
+ max_cache_size: 0.0
102
+ max_epoch: 105
103
+ model: espnet
104
+ model_conf:
105
+ classification_type: multi-label
106
+ log_epoch_metrics: true
107
+ lsm_weight: 0.0
108
+ mixup_probability: 0.2
109
+ multi_task_dataset: false
110
+ multiple_iterator: false
111
+ multiprocessing_distributed: false
112
+ nbest_averaging_interval: 0
113
+ ngpu: 0
114
+ no_forward_run: false
115
+ normalize: null
116
+ normalize_conf: {}
117
+ num_att_plot: 0
118
+ num_cache_chunks: 1024
119
+ num_iters_per_epoch: null
120
+ num_workers: 8
121
+ optim: adamw
122
+ optim_conf:
123
+ betas:
124
+ - 0.9
125
+ - 0.98
126
+ lr: 3.0e-05
127
+ weight_decay: 0.01
128
+ output_dir: /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/fsd50k/cls_earbasei2
129
+ patience: null
130
+ preencoder: null
131
+ preencoder_conf: {}
132
+ pretrain_path: null
133
+ print_config: false
134
+ required:
135
+ - output_dir
136
+ - token_list
137
+ resume: true
138
+ save_strategy: all
139
+ scheduler: cosineannealingwarmuprestarts
140
+ scheduler_conf:
141
+ first_cycle_steps: 95000
142
+ max_lr: 3.0e-05
143
+ min_lr: 5.0e-06
144
+ warmup_steps: 8000
145
+ seed: 0
146
+ sharded_ddp: false
147
+ shuffle_within_batch: false
148
+ sort_batch: descending
149
+ sort_in_batch: descending
150
+ specaug: null
151
+ specaug_conf: {}
152
+ task: cls
153
+ text_bpemodel: null
154
+ text_encoder: null
155
+ text_encoder_conf: {}
156
+ text_token_list: null
157
+ token_list: /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/data/fsd50k/token_list
158
+ train_data_path_and_name_and_type:
159
+ - !!python/tuple
160
+ - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/dump/fsd50k/train/wav.scp
161
+ - speech
162
+ - sound
163
+ - !!python/tuple
164
+ - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/dump/fsd50k/train/text
165
+ - label
166
+ - text
167
+ train_dtype: float32
168
+ train_shape_file:
169
+ - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/fsd50k/cls_stats_16k/train/speech_shape
170
+ - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/fsd50k/cls_stats_16k/train/label_shape
171
+ unused_parameters: true
172
+ use_adapter: false
173
+ use_amp: false
174
+ use_deepspeed: false
175
+ use_matplotlib: true
176
+ use_preprocessor: true
177
+ use_tensorboard: true
178
+ use_tf32: false
179
+ use_wandb: true
180
+ user_callbacks:
181
+ - mAP_logging
182
+ utt2weight_file: null
183
+ val_scheduler_criterion: !!python/tuple
184
+ - valid
185
+ - loss
186
+ valid_batch_bins: null
187
+ valid_batch_size: null
188
+ valid_batch_type: null
189
+ valid_data_path_and_name_and_type:
190
+ - !!python/tuple
191
+ - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/dump/fsd50k/val/wav.scp
192
+ - speech
193
+ - sound
194
+ - !!python/tuple
195
+ - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/dump/fsd50k/val/text
196
+ - label
197
+ - text
198
+ valid_iterator_type: null
199
+ valid_max_cache_size: null
200
+ valid_shape_file:
201
+ - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/fsd50k/cls_stats_16k/valid/speech_shape
202
+ - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/fsd50k/cls_stats_16k/valid/label_shape
203
+ wandb_entity: shikhar
204
+ wandb_id: null
205
+ wandb_model_log_interval: -1
206
+ wandb_name: fsd50k.earbasei2
207
+ wandb_project: audioverse
208
+ write_collected_feats: false
work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/fsd50k/cls_earbasei2/valid.epoch_mAP.ave_1best.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90921e2db74c8209b4b06e9bccc3466a3fe0b41165bd82132675ca280da0dda0
3
+ size 362120410