Upload ChunkFormer Classification Model
- README.md +95 -0
- config.yaml +96 -0
- global_cmvn +1 -0
- label_mapping.json +30 -0
- pytorch_model.pt +3 -0
README.md
ADDED
@@ -0,0 +1,95 @@
---
tags:
- audio-classification
- speech-classification
- audio
- chunkformer
- pytorch
- transformers
- speech-processing

license: apache-2.0
library_name: transformers
pipeline_tag: audio-classification
---

# ChunkFormer Classification Model
<style>
img {
    display: inline;
}
</style>
[GitHub](https://github.com/khanld/chunkformer)
[Paper](https://arxiv.org/abs/2502.14673)

This model performs speech classification tasks such as gender recognition, dialect identification, emotion detection, and age classification.

## Usage

Install the package:

```bash
pip install chunkformer
```

### Single Audio Classification

```python
from chunkformer import ChunkFormerModel

# Load the model
model = ChunkFormerModel.from_pretrained("khanhld/chunkformer-gender-emotion-dialect-age-classification")

# Classify a single audio file
result = model.classify_audio(
    audio_path="path/to/your/audio.wav",
    chunk_size=-1,             # -1 for full attention
    left_context_size=-1,
    right_context_size=-1,
    return_probabilities=True
)

print(result)
# Output example (class-index values; probability vectors match the class
# counts in label_mapping.json: 2 genders, 5 dialects, 8 emotions):
# {
#     'gender': 0,
#     'gender_probability': [0.95, 0.05],
#     'dialect': 3,
#     'dialect_probability': [0.1, 0.1, 0.05, 0.7, 0.05],
#     'emotion': 5,
#     'emotion_probability': [0.05, 0.02, 0.03, 0.05, 0.02, 0.75, 0.05, 0.03]
# }
```
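The integer values in `result` are class indices; `label_mapping.json`, shipped alongside this model, maps each index back to a label name. A minimal sketch of that lookup (assuming the `result` dictionary format shown above):

```python
import json

# label_mapping.json stores {task: {label_name: index}}; invert it so
# predicted indices can be mapped back to readable labels.
with open("label_mapping.json") as f:
    label_mapping = json.load(f)

id_to_label = {
    task: {idx: name for name, idx in names.items()}
    for task, names in label_mapping.items()
}

for task in ("gender", "dialect", "emotion", "age"):
    if task in result:
        print(task, "->", id_to_label[task][result[task]])
# e.g. gender -> female, dialect -> northern dialect, emotion -> neutral
```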

### Command Line Usage

```bash
chunkformer-decode \
    --model_checkpoint khanhld/chunkformer-gender-emotion-dialect-age-classification \
    --audio_file path/to/audio.wav \
    --return_probabilities
```

## Training

This model was trained with the ChunkFormer framework. For details on the training process and access to the source code, see https://github.com/khanld/chunkformer

Paper: https://arxiv.org/abs/2502.14673

## Citation

If you use this work in your research, please cite:

```bibtex
@INPROCEEDINGS{10888640,
  author={Le, Khanh and Ho, Tuan Vu and Tran, Dung and Chau, Duc Thanh},
  booktitle={ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  title={ChunkFormer: Masked Chunking Conformer For Long-Form Speech Transcription},
  year={2025},
  pages={1-5},
  doi={10.1109/ICASSP49660.2025.10888640}
}
```
config.yaml
ADDED
@@ -0,0 +1,96 @@
accum_grad: 1
cmvn: global_cmvn
cmvn_conf:
  cmvn_file: data/train_hf/global_cmvn
  is_json_cmvn: true
dataset: classification
dataset_conf:
  batch_conf:
    batch_size: 4
    batch_type: dynamic
    max_frames_in_batch: 80000
    pad_feat: true
  fbank_conf:
    dither: 1.0
    frame_length: 25
    frame_shift: 10
    num_mel_bins: 80
  filter_conf:
    max_length: 40960
    min_length: 0
  resample_conf:
    resample_rate: 16000
  shuffle: true
  shuffle_conf:
    shuffle_size: 1000
  sort: false
  sort_conf:
    sort_size: 500
  spec_aug: true
  spec_aug_conf:
    max_f: 10
    max_t: 50
    num_f_mask: 2
    num_t_mask: 2
  speed_perturb: true
  tasks:
  - gender
  - emotion
  - dialect
  - age
dtype: fp16
encoder: chunkformer
encoder_conf:
  activation_type: swish
  attention_dropout_rate: 0.1
  attention_heads: 4
  cnn_module_kernel: 15
  cnn_module_norm: layer_norm
  dropout_rate: 0.1
  dynamic_chunk_sizes:
  - -1
  - -1
  - 64
  - 128
  - 256
  dynamic_conv: true
  dynamic_left_context_sizes:
  - 64
  - 128
  - 256
  dynamic_right_context_sizes:
  - 64
  - 128
  - 256
  input_layer: dw_striding
  linear_units: 2048
  normalize_before: true
  num_blocks: 12
  output_size: 512
  pos_enc_layer_type: chunk_rel_pos
  positional_dropout_rate: 0.1
  selfattention_layer_type: chunk_rel_seflattn
  use_cnn_module: true
grad_clip: 5.0
input_dim: 80
log_interval: 100
max_epoch: 100
model: classification
model_conf:
  dropout_rate: 0.1
  label_smoothing: 0.2
  tasks:
    age: 5
    dialect: 5
    emotion: 8
    gender: 2
model_dir: exp/v1
optim: adamw
optim_conf:
  lr: 0.001
save_states: model_only
scheduler: warmuplr
scheduler_conf:
  warmup_steps: 5000
train_engine: torch_ddp
use_amp: true
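The `model_conf.tasks` block defines one classification head per task together with its class count, matching `label_mapping.json` below, while `encoder_conf` describes the shared ChunkFormer encoder. A quick way to inspect these settings, sketched with PyYAML (not part of the chunkformer package):

```python
import yaml  # pip install pyyaml

# Load the training config and list the classification heads.
# Key names follow the config.yaml shown above.
with open("config.yaml") as f:
    config = yaml.safe_load(f)

print(config["encoder"], "with", config["encoder_conf"]["num_blocks"], "blocks")
for task, num_classes in sorted(config["model_conf"]["tasks"].items()):
    print(f"{task}: {num_classes} classes")
# age: 5, dialect: 5, emotion: 8, gender: 2
```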
global_cmvn
ADDED
@@ -0,0 +1 @@
{"mean_stat": [261372048.0, 267103280.0, 294503520.0, 322519424.0, 348421568.0, 366889184.0, 377338816.0, 379566528.0, 379580672.0, 376285536.0, 378374208.0, 380432000.0, 385832768.0, 388648096.0, 387676416.0, 386573888.0, 383477728.0, 378764480.0, 378754240.0, 371668320.0, 366963648.0, 372021824.0, 366713792.0, 370329056.0, 368177056.0, 370016032.0, 367145248.0, 368462816.0, 367173312.0, 367278656.0, 367484224.0, 367187232.0, 366792800.0, 366821440.0, 367623840.0, 369116864.0, 370666048.0, 372521376.0, 374408384.0, 372320800.0, 374038592.0, 372046816.0, 373391168.0, 373426496.0, 374929760.0, 378237568.0, 382492800.0, 386041984.0, 387789728.0, 389485472.0, 389460288.0, 388519360.0, 387389120.0, 387256672.0, 386870976.0, 386530880.0, 385642112.0, 384694080.0, 384236736.0, 383496864.0, 383886464.0, 382380480.0, 379102080.0, 374611744.0, 369628832.0, 365862208.0, 364463200.0, 365086624.0, 365906560.0, 366363200.0, 367998496.0, 372256928.0, 373061216.0, 373929888.0, 377344832.0, 379676128.0, 380587552.0, 379426144.0, 374036064.0, 347577408.0], "var_stat": [2887069184.0, 3043122176.0, 3715460864.0, 4427841536.0, 5134542848.0, 5668194816.0, 5985690624.0, 6069530624.0, 6061555712.0, 5945719296.0, 5998680576.0, 6067827712.0, 6241057280.0, 6334793728.0, 6305166336.0, 6265669632.0, 6164521984.0, 6015184896.0, 6006290432.0, 5791601664.0, 5653274624.0, 5795687936.0, 5640153088.0, 5740518400.0, 5672561152.0, 5720160256.0, 5632040448.0, 5666395136.0, 5625992192.0, 5626292224.0, 5630668288.0, 5620961280.0, 5608001536.0, 5606545920.0, 5629255680.0, 5672909824.0, 5718445056.0, 5770887680.0, 5823025664.0, 5759273984.0, 5805913088.0, 5744293376.0, 5779486208.0, 5778481664.0, 5822397440.0, 5919054336.0, 6046029824.0, 6155350528.0, 6211739648.0, 6262714368.0, 6258859520.0, 6227259392.0, 6191663616.0, 6187784704.0, 6172819456.0, 6157926912.0, 6129428992.0, 6104130560.0, 6093665280.0, 6077194752.0, 6099302912.0, 6067632128.0, 5972396544.0, 5828241408.0, 5670683648.0, 5549988864.0, 5504571392.0, 5522436096.0, 5542863872.0, 5552516608.0, 5590664704.0, 5696314368.0, 5711076864.0, 5739663872.0, 5839644160.0, 5908163072.0, 5928923648.0, 5888913408.0, 5721562624.0, 4925175296.0], "frame_num": 25476743}
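global_cmvn stores accumulated feature statistics rather than the normalization vectors themselves: the per-bin mean is `mean_stat / frame_num`, and the variance is `var_stat / frame_num - mean^2`. A minimal sketch of recovering mean and inverse-std vectors from this JSON (the usual WeNet-style interpretation; not necessarily the package's own loader):

```python
import json
import math

# Turn accumulated CMVN statistics into per-dimension normalization vectors.
with open("global_cmvn") as f:
    stats = json.load(f)

n = stats["frame_num"]
means = [m / n for m in stats["mean_stat"]]
inv_stds = [
    1.0 / math.sqrt(max(v / n - mean * mean, 1e-20))  # floor avoids div-by-zero
    for v, mean in zip(stats["var_stat"], means)
]

# A feature frame x is normalized per mel bin (80 dims here) as (x - mean) * inv_std.
print(len(means), round(means[0], 2), round(inv_stds[0], 4))
```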
label_mapping.json
ADDED
@@ -0,0 +1,30 @@
{
  "gender": {
    "female": 0,
    "male": 1
  },
  "dialect": {
    "central dialect": 0,
    "highland central dialect": 1,
    "minority ethnic group dialect": 2,
    "northern dialect": 3,
    "southern dialect": 4
  },
  "emotion": {
    "angry": 0,
    "disgust": 1,
    "fear": 2,
    "happy": 3,
    "joyful": 4,
    "neutral": 5,
    "sad": 6,
    "tired": 7
  },
  "age": {
    "children": 0,
    "middle age": 1,
    "old": 2,
    "very old": 3,
    "young": 4
  }
}
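Cross-referenced with the README's example output, these tables give gender 0 = "female", dialect 3 = "northern dialect", and emotion 5 = "neutral"; the inversion sketch in the Usage section above automates this lookup.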
pytorch_model.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9756b07e93d0367f4f1b5045d0e189065fca81c848d6976513fce9f60c31ef70
size 314935026