khanhld committed (verified)
Commit 360da7f · 1 Parent(s): cf4bdb6

Upload ChunkFormer Classification Model

Files changed (5)
  1. README.md +95 -0
  2. config.yaml +96 -0
  3. global_cmvn +1 -0
  4. label_mapping.json +30 -0
  5. pytorch_model.pt +3 -0
README.md ADDED
@@ -0,0 +1,95 @@

---
tags:
- audio-classification
- speech-classification
- audio
- chunkformer
- pytorch
- transformers
- speech-processing

license: apache-2.0
library_name: transformers
pipeline_tag: audio-classification
---

# ChunkFormer Classification Model
<style>
img {
    display: inline;
}
</style>
[![GitHub](https://img.shields.io/badge/GitHub-ChunkFormer-blue)](https://github.com/khanld/chunkformer)
[![Paper](https://img.shields.io/badge/Paper-ICASSP%202025-green)](https://arxiv.org/abs/2502.14673)

This model performs four speech classification tasks: gender recognition, dialect identification, emotion recognition, and age-group classification (the label sets are listed in `label_mapping.json`).

## Usage

Install the package:

```bash
pip install chunkformer
```

### Single Audio Classification

```python
from chunkformer import ChunkFormerModel

# Load the model
model = ChunkFormerModel.from_pretrained("khanhld/chunkformer-gender-emotion-dialect-age-classification")

# Classify a single audio file
result = model.classify_audio(
    audio_path="path/to/your/audio.wav",
    chunk_size=-1,            # -1 for full attention
    left_context_size=-1,
    right_context_size=-1,
    return_probabilities=True
)

print(result)
# Example output (illustrative values; the integers are class indices
# defined in label_mapping.json):
# {
#     'gender': 0,
#     'gender_probability': [0.95, 0.05],
#     'dialect': 3,
#     'dialect_probability': [0.05, 0.1, 0.05, 0.7, 0.1],
#     'emotion': 5,
#     'emotion_probability': [0.02, 0.01, 0.02, 0.05, 0.02, 0.8, 0.05, 0.03],
#     'age': 4,
#     'age_probability': [0.05, 0.1, 0.05, 0.05, 0.75]
# }
```
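The integers in `result` are class indices. Below is a minimal sketch for turning them into human-readable labels with this repository's `label_mapping.json`; fetching the file through `huggingface_hub` is just one convenient option here, and any local copy of the file works the same way.

```python
import json

from huggingface_hub import hf_hub_download

# label_mapping.json maps label name -> class index for each task;
# invert it so the indices returned by classify_audio can be decoded.
mapping_path = hf_hub_download(
    repo_id="khanhld/chunkformer-gender-emotion-dialect-age-classification",
    filename="label_mapping.json",
)
with open(mapping_path, encoding="utf-8") as f:
    label_mapping = json.load(f)

id2label = {
    task: {index: name for name, index in labels.items()}
    for task, labels in label_mapping.items()
}

# 'result' is the dictionary returned by model.classify_audio(...) above
for task in ("gender", "dialect", "emotion", "age"):
    if task in result:
        print(f"{task}: {id2label[task][result[task]]}")
```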
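The ChunkFormer encoder can also run with limited attention context instead of full attention, which is what makes it suited to long recordings. A minimal sketch, continuing from the example above and assuming `classify_audio` accepts the finite chunk and context sizes the model was trained with (the values below mirror `dynamic_chunk_sizes` and the context sizes in `config.yaml`):

```python
# Chunked inference with limited left/right context.
# The specific sizes are assumptions taken from the training configuration.
result = model.classify_audio(
    audio_path="path/to/your/audio.wav",
    chunk_size=64,
    left_context_size=128,
    right_context_size=128,
    return_probabilities=True
)
print(result)
```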
### Command Line Usage

```bash
chunkformer-decode \
    --model_checkpoint khanhld/chunkformer-gender-emotion-dialect-age-classification \
    --audio_file path/to/audio.wav \
    --return_probabilities
```

## Training

This model was trained using the ChunkFormer framework. For more details about the training process and to access the source code, please visit: https://github.com/khanld/chunkformer

Paper: https://arxiv.org/abs/2502.14673

## Citation

If you use this work in your research, please cite:

```bibtex
@INPROCEEDINGS{10888640,
  author={Le, Khanh and Ho, Tuan Vu and Tran, Dung and Chau, Duc Thanh},
  booktitle={ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  title={ChunkFormer: Masked Chunking Conformer For Long-Form Speech Transcription},
  year={2025},
  volume={},
  number={},
  pages={1-5},
  keywords={Scalability;Memory management;Graphics processing units;Signal processing;Performance gain;Hardware;Resource management;Speech processing;Standards;Context modeling;chunkformer;masked batch;long-form transcription},
  doi={10.1109/ICASSP49660.2025.10888640}}
```
config.yaml ADDED
@@ -0,0 +1,96 @@

accum_grad: 1
cmvn: global_cmvn
cmvn_conf:
  cmvn_file: data/train_hf/global_cmvn
  is_json_cmvn: true
dataset: classification
dataset_conf:
  batch_conf:
    batch_size: 4
    batch_type: dynamic
    max_frames_in_batch: 80000
    pad_feat: true
  fbank_conf:
    dither: 1.0
    frame_length: 25
    frame_shift: 10
    num_mel_bins: 80
  filter_conf:
    max_length: 40960
    min_length: 0
  resample_conf:
    resample_rate: 16000
  shuffle: true
  shuffle_conf:
    shuffle_size: 1000
  sort: false
  sort_conf:
    sort_size: 500
  spec_aug: true
  spec_aug_conf:
    max_f: 10
    max_t: 50
    num_f_mask: 2
    num_t_mask: 2
  speed_perturb: true
  tasks:
  - gender
  - emotion
  - dialect
  - age
dtype: fp16
encoder: chunkformer
encoder_conf:
  activation_type: swish
  attention_dropout_rate: 0.1
  attention_heads: 4
  cnn_module_kernel: 15
  cnn_module_norm: layer_norm
  dropout_rate: 0.1
  dynamic_chunk_sizes:
  - -1
  - -1
  - 64
  - 128
  - 256
  dynamic_conv: true
  dynamic_left_context_sizes:
  - 64
  - 128
  - 256
  dynamic_right_context_sizes:
  - 64
  - 128
  - 256
  input_layer: dw_striding
  linear_units: 2048
  normalize_before: true
  num_blocks: 12
  output_size: 512
  pos_enc_layer_type: chunk_rel_pos
  positional_dropout_rate: 0.1
  selfattention_layer_type: chunk_rel_seflattn
  use_cnn_module: true
grad_clip: 5.0
input_dim: 80
log_interval: 100
max_epoch: 100
model: classification
model_conf:
  dropout_rate: 0.1
  label_smoothing: 0.2
  tasks:
    age: 5
    dialect: 5
    emotion: 8
    gender: 2
model_dir: exp/v1
optim: adamw
optim_conf:
  lr: 0.001
save_states: model_only
scheduler: warmuplr
scheduler_conf:
  warmup_steps: 5000
train_engine: torch_ddp
use_amp: true
global_cmvn ADDED
@@ -0,0 +1 @@
+ {"mean_stat": [261372048.0, 267103280.0, 294503520.0, 322519424.0, 348421568.0, 366889184.0, 377338816.0, 379566528.0, 379580672.0, 376285536.0, 378374208.0, 380432000.0, 385832768.0, 388648096.0, 387676416.0, 386573888.0, 383477728.0, 378764480.0, 378754240.0, 371668320.0, 366963648.0, 372021824.0, 366713792.0, 370329056.0, 368177056.0, 370016032.0, 367145248.0, 368462816.0, 367173312.0, 367278656.0, 367484224.0, 367187232.0, 366792800.0, 366821440.0, 367623840.0, 369116864.0, 370666048.0, 372521376.0, 374408384.0, 372320800.0, 374038592.0, 372046816.0, 373391168.0, 373426496.0, 374929760.0, 378237568.0, 382492800.0, 386041984.0, 387789728.0, 389485472.0, 389460288.0, 388519360.0, 387389120.0, 387256672.0, 386870976.0, 386530880.0, 385642112.0, 384694080.0, 384236736.0, 383496864.0, 383886464.0, 382380480.0, 379102080.0, 374611744.0, 369628832.0, 365862208.0, 364463200.0, 365086624.0, 365906560.0, 366363200.0, 367998496.0, 372256928.0, 373061216.0, 373929888.0, 377344832.0, 379676128.0, 380587552.0, 379426144.0, 374036064.0, 347577408.0], "var_stat": [2887069184.0, 3043122176.0, 3715460864.0, 4427841536.0, 5134542848.0, 5668194816.0, 5985690624.0, 6069530624.0, 6061555712.0, 5945719296.0, 5998680576.0, 6067827712.0, 6241057280.0, 6334793728.0, 6305166336.0, 6265669632.0, 6164521984.0, 6015184896.0, 6006290432.0, 5791601664.0, 5653274624.0, 5795687936.0, 5640153088.0, 5740518400.0, 5672561152.0, 5720160256.0, 5632040448.0, 5666395136.0, 5625992192.0, 5626292224.0, 5630668288.0, 5620961280.0, 5608001536.0, 5606545920.0, 5629255680.0, 5672909824.0, 5718445056.0, 5770887680.0, 5823025664.0, 5759273984.0, 5805913088.0, 5744293376.0, 5779486208.0, 5778481664.0, 5822397440.0, 5919054336.0, 6046029824.0, 6155350528.0, 6211739648.0, 6262714368.0, 6258859520.0, 6227259392.0, 6191663616.0, 6187784704.0, 6172819456.0, 6157926912.0, 6129428992.0, 6104130560.0, 6093665280.0, 6077194752.0, 6099302912.0, 6067632128.0, 5972396544.0, 5828241408.0, 5670683648.0, 5549988864.0, 5504571392.0, 5522436096.0, 5542863872.0, 5552516608.0, 5590664704.0, 5696314368.0, 5711076864.0, 5739663872.0, 5839644160.0, 5908163072.0, 5928923648.0, 5888913408.0, 5721562624.0, 4925175296.0], "frame_num": 25476743}
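`global_cmvn` stores accumulated per-dimension statistics (`mean_stat`, `var_stat`, `frame_num`) rather than the normalization parameters themselves. Below is a minimal sketch of how such statistics are typically converted into a per-bin mean and inverse standard deviation, assuming the JSON-CMVN convention used by WeNet-style recipes (consistent with `is_json_cmvn: true` in config.yaml); this is an illustration, not code taken from the ChunkFormer package.

```python
import json
import math

# Load the accumulated statistics shipped in this repository.
with open("global_cmvn", encoding="utf-8") as f:
    stats = json.load(f)

n = stats["frame_num"]
means = [m / n for m in stats["mean_stat"]]
# var = E[x^2] - E[x]^2, floored to keep the inverse std finite.
variances = [max(v / n - mu * mu, 1e-20) for v, mu in zip(stats["var_stat"], means)]
inv_stds = [1.0 / math.sqrt(var) for var in variances]

# Each 80-dim fbank frame is then normalized as (x - mean) * inv_std.
```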
label_mapping.json ADDED
@@ -0,0 +1,30 @@

{
  "gender": {
    "female": 0,
    "male": 1
  },
  "dialect": {
    "central dialect": 0,
    "highland central dialect": 1,
    "minority ethnic group dialect": 2,
    "northern dialect": 3,
    "southern dialect": 4
  },
  "emotion": {
    "angry": 0,
    "disgust": 1,
    "fear": 2,
    "happy": 3,
    "joyful": 4,
    "neutral": 5,
    "sad": 6,
    "tired": 7
  },
  "age": {
    "children": 0,
    "middle age": 1,
    "old": 2,
    "very old": 3,
    "young": 4
  }
}
pytorch_model.pt ADDED
@@ -0,0 +1,3 @@

version https://git-lfs.github.com/spec/v1
oid sha256:9756b07e93d0367f4f1b5045d0e189065fca81c848d6976513fce9f60c31ef70
size 314935026