khanhld committed (verified)
Commit 360da7f · 1 Parent(s): cf4bdb6

Upload ChunkFormer Classification Model

Files changed (5)
  1. README.md +95 -0
  2. config.yaml +96 -0
  3. global_cmvn +1 -0
  4. label_mapping.json +30 -0
  5. pytorch_model.pt +3 -0
README.md ADDED
@@ -0,0 +1,95 @@

---
tags:
- audio-classification
- speech-classification
- audio
- chunkformer
- pytorch
- transformers
- speech-processing

license: apache-2.0
library_name: transformers
pipeline_tag: audio-classification
---

# ChunkFormer Classification Model
<style>
img {
    display: inline;
}
</style>
[![GitHub](https://img.shields.io/badge/GitHub-ChunkFormer-blue)](https://github.com/khanld/chunkformer)
[![Paper](https://img.shields.io/badge/Paper-ICASSP%202025-green)](https://arxiv.org/abs/2502.14673)

This model performs four speech classification tasks: gender recognition, dialect identification, emotion recognition, and age-group classification (the label sets are listed in `label_mapping.json`).

## Usage

Install the package:

```bash
pip install chunkformer
```

### Single Audio Classification

```python
from chunkformer import ChunkFormerModel

# Load the model
model = ChunkFormerModel.from_pretrained("khanhld/chunkformer-gender-emotion-dialect-age-classification")

# Classify a single audio file
result = model.classify_audio(
    audio_path="path/to/your/audio.wav",
    chunk_size=-1,            # -1 for full attention
    left_context_size=-1,
    right_context_size=-1,
    return_probabilities=True
)

print(result)
# Example output (illustrative values; the integers are class indices
# defined in label_mapping.json):
# {
#     'gender': 0,
#     'gender_probability': [0.95, 0.05],
#     'dialect': 3,
#     'dialect_probability': [0.05, 0.1, 0.05, 0.7, 0.1],
#     'emotion': 5,
#     'emotion_probability': [0.02, 0.01, 0.02, 0.05, 0.02, 0.8, 0.05, 0.03],
#     'age': 4,
#     'age_probability': [0.05, 0.1, 0.05, 0.05, 0.75]
# }
```
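The integers in `result` are class indices. Below is a minimal sketch for turning them into human-readable labels with this repository's `label_mapping.json`; fetching the file through `huggingface_hub` is just one convenient option here, and any local copy of the file works the same way.

```python
import json

from huggingface_hub import hf_hub_download

# label_mapping.json maps label name -> class index for each task;
# invert it so the indices returned by classify_audio can be decoded.
mapping_path = hf_hub_download(
    repo_id="khanhld/chunkformer-gender-emotion-dialect-age-classification",
    filename="label_mapping.json",
)
with open(mapping_path, encoding="utf-8") as f:
    label_mapping = json.load(f)

id2label = {
    task: {index: name for name, index in labels.items()}
    for task, labels in label_mapping.items()
}

# 'result' is the dictionary returned by model.classify_audio(...) above
for task in ("gender", "dialect", "emotion", "age"):
    if task in result:
        print(f"{task}: {id2label[task][result[task]]}")
```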
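The ChunkFormer encoder can also run with limited attention context instead of full attention, which is what makes it suited to long recordings. A minimal sketch, continuing from the example above and assuming `classify_audio` accepts the finite chunk and context sizes the model was trained with (the values below mirror `dynamic_chunk_sizes` and the context sizes in `config.yaml`):

```python
# Chunked inference with limited left/right context.
# The specific sizes are assumptions taken from the training configuration.
result = model.classify_audio(
    audio_path="path/to/your/audio.wav",
    chunk_size=64,
    left_context_size=128,
    right_context_size=128,
    return_probabilities=True
)
print(result)
```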
### Command Line Usage

```bash
chunkformer-decode \
    --model_checkpoint khanhld/chunkformer-gender-emotion-dialect-age-classification \
    --audio_file path/to/audio.wav \
    --return_probabilities
```

## Training

This model was trained using the ChunkFormer framework. For more details about the training process and to access the source code, please visit: https://github.com/khanld/chunkformer

Paper: https://arxiv.org/abs/2502.14673

## Citation

If you use this work in your research, please cite:

```bibtex
@INPROCEEDINGS{10888640,
  author={Le, Khanh and Ho, Tuan Vu and Tran, Dung and Chau, Duc Thanh},
  booktitle={ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  title={ChunkFormer: Masked Chunking Conformer For Long-Form Speech Transcription},
  year={2025},
  volume={},
  number={},
  pages={1-5},
  keywords={Scalability;Memory management;Graphics processing units;Signal processing;Performance gain;Hardware;Resource management;Speech processing;Standards;Context modeling;chunkformer;masked batch;long-form transcription},
  doi={10.1109/ICASSP49660.2025.10888640}}
```
config.yaml ADDED
@@ -0,0 +1,96 @@

accum_grad: 1
cmvn: global_cmvn
cmvn_conf:
  cmvn_file: data/train_hf/global_cmvn
  is_json_cmvn: true
dataset: classification
dataset_conf:
  batch_conf:
    batch_size: 4
    batch_type: dynamic
    max_frames_in_batch: 80000
    pad_feat: true
  fbank_conf:
    dither: 1.0
    frame_length: 25
    frame_shift: 10
    num_mel_bins: 80
  filter_conf:
    max_length: 40960
    min_length: 0
  resample_conf:
    resample_rate: 16000
  shuffle: true
  shuffle_conf:
    shuffle_size: 1000
  sort: false
  sort_conf:
    sort_size: 500
  spec_aug: true
  spec_aug_conf:
    max_f: 10
    max_t: 50
    num_f_mask: 2
    num_t_mask: 2
  speed_perturb: true
  tasks:
  - gender
  - emotion
  - dialect
  - age
dtype: fp16
encoder: chunkformer
encoder_conf:
  activation_type: swish
  attention_dropout_rate: 0.1
  attention_heads: 4
  cnn_module_kernel: 15
  cnn_module_norm: layer_norm
  dropout_rate: 0.1
  dynamic_chunk_sizes:
  - -1
  - -1
  - 64
  - 128
  - 256
  dynamic_conv: true
  dynamic_left_context_sizes:
  - 64
  - 128
  - 256
  dynamic_right_context_sizes:
  - 64
  - 128
  - 256
  input_layer: dw_striding
  linear_units: 2048
  normalize_before: true
  num_blocks: 12
  output_size: 512
  pos_enc_layer_type: chunk_rel_pos
  positional_dropout_rate: 0.1
  selfattention_layer_type: chunk_rel_seflattn
  use_cnn_module: true
grad_clip: 5.0
input_dim: 80
log_interval: 100
max_epoch: 100
model: classification
model_conf:
  dropout_rate: 0.1
  label_smoothing: 0.2
  tasks:
    age: 5
    dialect: 5
    emotion: 8
    gender: 2
model_dir: exp/v1
optim: adamw
optim_conf:
  lr: 0.001
save_states: model_only
scheduler: warmuplr
scheduler_conf:
  warmup_steps: 5000
train_engine: torch_ddp
use_amp: true
global_cmvn ADDED
@@ -0,0 +1 @@
+ {"mean_stat": [261372048.0, 267103280.0, 294503520.0, 322519424.0, 348421568.0, 366889184.0, 377338816.0, 379566528.0, 379580672.0, 376285536.0, 378374208.0, 380432000.0, 385832768.0, 388648096.0, 387676416.0, 386573888.0, 383477728.0, 378764480.0, 378754240.0, 371668320.0, 366963648.0, 372021824.0, 366713792.0, 370329056.0, 368177056.0, 370016032.0, 367145248.0, 368462816.0, 367173312.0, 367278656.0, 367484224.0, 367187232.0, 366792800.0, 366821440.0, 367623840.0, 369116864.0, 370666048.0, 372521376.0, 374408384.0, 372320800.0, 374038592.0, 372046816.0, 373391168.0, 373426496.0, 374929760.0, 378237568.0, 382492800.0, 386041984.0, 387789728.0, 389485472.0, 389460288.0, 388519360.0, 387389120.0, 387256672.0, 386870976.0, 386530880.0, 385642112.0, 384694080.0, 384236736.0, 383496864.0, 383886464.0, 382380480.0, 379102080.0, 374611744.0, 369628832.0, 365862208.0, 364463200.0, 365086624.0, 365906560.0, 366363200.0, 367998496.0, 372256928.0, 373061216.0, 373929888.0, 377344832.0, 379676128.0, 380587552.0, 379426144.0, 374036064.0, 347577408.0], "var_stat": [2887069184.0, 3043122176.0, 3715460864.0, 4427841536.0, 5134542848.0, 5668194816.0, 5985690624.0, 6069530624.0, 6061555712.0, 5945719296.0, 5998680576.0, 6067827712.0, 6241057280.0, 6334793728.0, 6305166336.0, 6265669632.0, 6164521984.0, 6015184896.0, 6006290432.0, 5791601664.0, 5653274624.0, 5795687936.0, 5640153088.0, 5740518400.0, 5672561152.0, 5720160256.0, 5632040448.0, 5666395136.0, 5625992192.0, 5626292224.0, 5630668288.0, 5620961280.0, 5608001536.0, 5606545920.0, 5629255680.0, 5672909824.0, 5718445056.0, 5770887680.0, 5823025664.0, 5759273984.0, 5805913088.0, 5744293376.0, 5779486208.0, 5778481664.0, 5822397440.0, 5919054336.0, 6046029824.0, 6155350528.0, 6211739648.0, 6262714368.0, 6258859520.0, 6227259392.0, 6191663616.0, 6187784704.0, 6172819456.0, 6157926912.0, 6129428992.0, 6104130560.0, 6093665280.0, 6077194752.0, 6099302912.0, 6067632128.0, 5972396544.0, 5828241408.0, 5670683648.0, 5549988864.0, 5504571392.0, 5522436096.0, 5542863872.0, 5552516608.0, 5590664704.0, 5696314368.0, 5711076864.0, 5739663872.0, 5839644160.0, 5908163072.0, 5928923648.0, 5888913408.0, 5721562624.0, 4925175296.0], "frame_num": 25476743}
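`global_cmvn` stores accumulated per-dimension statistics (`mean_stat`, `var_stat`, `frame_num`) rather than the normalization parameters themselves. Below is a minimal sketch of how such statistics are typically converted into a per-bin mean and inverse standard deviation, assuming the JSON-CMVN convention used by WeNet-style recipes (consistent with `is_json_cmvn: true` in config.yaml); this is an illustration, not code taken from the ChunkFormer package.

```python
import json
import math

# Load the accumulated statistics shipped in this repository.
with open("global_cmvn", encoding="utf-8") as f:
    stats = json.load(f)

n = stats["frame_num"]
means = [m / n for m in stats["mean_stat"]]
# var = E[x^2] - E[x]^2, floored to keep the inverse std finite.
variances = [max(v / n - mu * mu, 1e-20) for v, mu in zip(stats["var_stat"], means)]
inv_stds = [1.0 / math.sqrt(var) for var in variances]

# Each 80-dim fbank frame is then normalized as (x - mean) * inv_std.
```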
label_mapping.json ADDED
@@ -0,0 +1,30 @@

{
  "gender": {
    "female": 0,
    "male": 1
  },
  "dialect": {
    "central dialect": 0,
    "highland central dialect": 1,
    "minority ethnic group dialect": 2,
    "northern dialect": 3,
    "southern dialect": 4
  },
  "emotion": {
    "angry": 0,
    "disgust": 1,
    "fear": 2,
    "happy": 3,
    "joyful": 4,
    "neutral": 5,
    "sad": 6,
    "tired": 7
  },
  "age": {
    "children": 0,
    "middle age": 1,
    "old": 2,
    "very old": 3,
    "young": 4
  }
}
pytorch_model.pt ADDED
@@ -0,0 +1,3 @@

version https://git-lfs.github.com/spec/v1
oid sha256:9756b07e93d0367f4f1b5045d0e189065fca81c848d6976513fce9f60c31ef70
size 314935026