trfms-integration (#2)
Browse files- trfms integration (2f462c276eba6866cd5d5731228a533a145aade3)
- readme update (ede40faa7357d6ca81220205015e088a4bd7b129)
- readme update (b6353ea1b1d2db6c36b90532d1e127a10aed6f52)
- readme update (431fe8681e1c7875a50f5c5815057738725d3f2e)
- readme update (8da514f9f850549f3016842882a52e040933e898)
Co-authored-by: Eustache Le Bihan <[email protected]>
- README.md +76 -2
- config.json +15 -28
- model.safetensors +2 -2
- preprocessor_config.json +3 -10
- special_tokens_map.json +16 -0
- tokenizer.json +0 -0
- tokenizer_config.json +11 -3
README.md
CHANGED
|
@@ -18,6 +18,7 @@ datasets:
|
|
| 18 |
- MLCommons/peoples_speech
|
| 19 |
thumbnail: null
|
| 20 |
tags:
|
|
|
|
| 21 |
- automatic-speech-recognition
|
| 22 |
- speech
|
| 23 |
- audio
|
|
@@ -191,7 +192,7 @@ pip install nemo_toolkit['all']
|
|
| 191 |
|
| 192 |
## How to Use this Model
|
| 193 |
|
| 194 |
-
The model is available for use in the NeMo toolkit [3], and can be used as a pre-trained checkpoint for inference or for fine-tuning on another dataset.
|
| 195 |
|
| 196 |
### Automatically instantiate the model
|
| 197 |
|
|
@@ -200,7 +201,7 @@ import nemo.collections.asr as nemo_asr
|
|
| 200 |
asr_model = nemo_asr.models.EncDecCTCModelBPE.from_pretrained(model_name="nvidia/parakeet-ctc-1.1b")
|
| 201 |
```
|
| 202 |
|
| 203 |
-
### Transcribing using
|
| 204 |
First, let's get a sample
|
| 205 |
```
|
| 206 |
wget https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav
|
|
@@ -210,6 +211,79 @@ Then simply do:
|
|
| 210 |
asr_model.transcribe(['2086-149220-0033.wav'])
|
| 211 |
```
|
| 212 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 213 |
### Transcribing many audio files
|
| 214 |
|
| 215 |
```shell
|
|
|
|
| 18 |
- MLCommons/peoples_speech
|
| 19 |
thumbnail: null
|
| 20 |
tags:
|
| 21 |
+
- transformers
|
| 22 |
- automatic-speech-recognition
|
| 23 |
- speech
|
| 24 |
- audio
|
|
|
|
| 192 |
|
| 193 |
## How to Use this Model
|
| 194 |
|
| 195 |
+
The model is available for use in the NeMo toolkit [3], and can be used as a pre-trained checkpoint for inference or for fine-tuning on another dataset. Moreover, you can now run Parakeet CTC natively with [Transformers](https://github.com/huggingface/transformers) 🤗.
|
| 196 |
|
| 197 |
### Automatically instantiate the model
|
| 198 |
|
|
|
|
| 201 |
asr_model = nemo_asr.models.EncDecCTCModelBPE.from_pretrained(model_name="nvidia/parakeet-ctc-1.1b")
|
| 202 |
```
|
| 203 |
|
| 204 |
+
### Transcribing using NeMo
|
| 205 |
First, let's get a sample
|
| 206 |
```
|
| 207 |
wget https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav
|
|
|
|
| 211 |
asr_model.transcribe(['2086-149220-0033.wav'])
|
| 212 |
```
|
| 213 |
|
| 214 |
+
### Transcribing using [Transformers](https://github.com/huggingface/transformers) 🤗
|
| 215 |
+
|
| 216 |
+
Make sure to install `transformers` from source.
|
| 217 |
+
|
| 218 |
+
```bash
|
| 219 |
+
pip install git+https://github.com/huggingface/transformers
|
| 220 |
+
```
|
| 221 |
+
|
| 222 |
+
<details>
|
| 223 |
+
<summary>➡️ Pipeline usage</summary>
|
| 224 |
+
|
| 225 |
+
```python
|
| 226 |
+
from transformers import pipeline
|
| 227 |
+
|
| 228 |
+
pipe = pipeline("automatic-speech-recognition", model="nvidia/parakeet-ctc-1.1b")
|
| 229 |
+
out = pipe("https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/bcn_weather.mp3")
|
| 230 |
+
print(out)
|
| 231 |
+
```
|
| 232 |
+
</details>
|
| 233 |
+
|
| 234 |
+
<details>
|
| 235 |
+
<summary>➡️ AutoModel</summary>
|
| 236 |
+
|
| 237 |
+
```python
|
| 238 |
+
from transformers import AutoModelForCTC, AutoProcessor
|
| 239 |
+
from datasets import load_dataset, Audio
|
| 240 |
+
import torch
|
| 241 |
+
|
| 242 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 243 |
+
|
| 244 |
+
processor = AutoProcessor.from_pretrained("nvidia/parakeet-ctc-1.1b")
|
| 245 |
+
model = AutoModelForCTC.from_pretrained("nvidia/parakeet-ctc-1.1b", dtype="auto", device_map=device)
|
| 246 |
+
|
| 247 |
+
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
| 248 |
+
ds = ds.cast_column("audio", Audio(sampling_rate=processor.feature_extractor.sampling_rate))
|
| 249 |
+
speech_samples = [el['array'] for el in ds["audio"][:5]]
|
| 250 |
+
|
| 251 |
+
inputs = processor(speech_samples, sampling_rate=processor.feature_extractor.sampling_rate)
|
| 252 |
+
inputs.to(model.device, dtype=model.dtype)
|
| 253 |
+
outputs = model.generate(**inputs)
|
| 254 |
+
print(processor.batch_decode(outputs))
|
| 255 |
+
```
|
| 256 |
+
</details>
|
| 257 |
+
|
| 258 |
+
<details>
|
| 259 |
+
<summary>➡️ Training</summary>
|
| 260 |
+
|
| 261 |
+
```python
|
| 262 |
+
from transformers import AutoModelForCTC, AutoProcessor
|
| 263 |
+
from datasets import load_dataset, Audio
|
| 264 |
+
import torch
|
| 265 |
+
|
| 266 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 267 |
+
|
| 268 |
+
processor = AutoProcessor.from_pretrained("nvidia/parakeet-ctc-1.1b")
|
| 269 |
+
model = AutoModelForCTC.from_pretrained("nvidia/parakeet-ctc-1.1b", dtype="auto", device_map=device)
|
| 270 |
+
|
| 271 |
+
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
| 272 |
+
ds = ds.cast_column("audio", Audio(sampling_rate=processor.feature_extractor.sampling_rate))
|
| 273 |
+
speech_samples = [el['array'] for el in ds["audio"][:5]]
|
| 274 |
+
text_samples = [el for el in ds["text"][:5]]
|
| 275 |
+
|
| 276 |
+
# passing `text` to the processor will prepare inputs' `labels` key
|
| 277 |
+
inputs = processor(audio=speech_samples, text=text_samples, sampling_rate=processor.feature_extractor.sampling_rate)
|
| 278 |
+
inputs.to(device, dtype=model.dtype)
|
| 279 |
+
|
| 280 |
+
outputs = model(**inputs)
|
| 281 |
+
outputs.loss.backward()
|
| 282 |
+
```
|
| 283 |
+
</details>
|
| 284 |
+
|
| 285 |
+
For more details about usage, refer to [Transformers' documentation](https://huggingface.co/docs/transformers/en/index).
|
| 286 |
+
|
| 287 |
### Transcribing many audio files
|
| 288 |
|
| 289 |
```shell
|
config.json
CHANGED
|
@@ -1,50 +1,37 @@
|
|
| 1 |
{
|
| 2 |
"architectures": [
|
| 3 |
-
"
|
| 4 |
],
|
| 5 |
-
"blank_token_id": 1024,
|
| 6 |
-
"bos_token_id": 1,
|
| 7 |
"ctc_loss_reduction": "mean",
|
| 8 |
"ctc_zero_infinity": true,
|
|
|
|
| 9 |
"encoder_config": {
|
| 10 |
"activation_dropout": 0.1,
|
| 11 |
-
"
|
| 12 |
-
"architectures": [
|
| 13 |
-
"FastConformerModel"
|
| 14 |
-
],
|
| 15 |
"attention_dropout": 0.1,
|
| 16 |
-
"attention_probs_dropout_prob": 0.1,
|
| 17 |
"conv_kernel_size": 9,
|
| 18 |
-
"d_model": 1024,
|
| 19 |
"dropout": 0.1,
|
| 20 |
-
"
|
| 21 |
-
"encoder_attention_heads": 8,
|
| 22 |
-
"encoder_ffn_dim": 4096,
|
| 23 |
-
"encoder_layerdrop": 0.1,
|
| 24 |
-
"encoder_layers": 42,
|
| 25 |
"hidden_act": "silu",
|
| 26 |
-
"hidden_dropout_prob": 0.1,
|
| 27 |
"hidden_size": 1024,
|
| 28 |
"initializer_range": 0.02,
|
| 29 |
"intermediate_size": 4096,
|
| 30 |
-
"
|
| 31 |
-
"
|
| 32 |
-
"
|
| 33 |
-
"nemo_model_type": "parakeet_ctc",
|
| 34 |
"num_attention_heads": 8,
|
| 35 |
"num_hidden_layers": 42,
|
|
|
|
| 36 |
"num_mel_bins": 80,
|
|
|
|
| 37 |
"subsampling_conv_channels": 256,
|
| 38 |
-
"
|
| 39 |
-
"
|
| 40 |
-
"
|
| 41 |
-
"vocab_size": 1024,
|
| 42 |
-
"xscaling": true
|
| 43 |
},
|
| 44 |
-
"
|
| 45 |
"model_type": "parakeet_ctc",
|
| 46 |
-
"pad_token_id":
|
| 47 |
-
"
|
| 48 |
-
"transformers_version": "4.54.0.dev0",
|
| 49 |
"vocab_size": 1025
|
| 50 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"architectures": [
|
| 3 |
+
"ParakeetForCTC"
|
| 4 |
],
|
|
|
|
|
|
|
| 5 |
"ctc_loss_reduction": "mean",
|
| 6 |
"ctc_zero_infinity": true,
|
| 7 |
+
"dtype": "bfloat16",
|
| 8 |
"encoder_config": {
|
| 9 |
"activation_dropout": 0.1,
|
| 10 |
+
"attention_bias": true,
|
|
|
|
|
|
|
|
|
|
| 11 |
"attention_dropout": 0.1,
|
|
|
|
| 12 |
"conv_kernel_size": 9,
|
|
|
|
| 13 |
"dropout": 0.1,
|
| 14 |
+
"dropout_positions": 0.0,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
"hidden_act": "silu",
|
|
|
|
| 16 |
"hidden_size": 1024,
|
| 17 |
"initializer_range": 0.02,
|
| 18 |
"intermediate_size": 4096,
|
| 19 |
+
"layerdrop": 0.1,
|
| 20 |
+
"max_position_embeddings": 5000,
|
| 21 |
+
"model_type": "parakeet_encoder",
|
|
|
|
| 22 |
"num_attention_heads": 8,
|
| 23 |
"num_hidden_layers": 42,
|
| 24 |
+
"num_key_value_heads": 8,
|
| 25 |
"num_mel_bins": 80,
|
| 26 |
+
"scale_input": true,
|
| 27 |
"subsampling_conv_channels": 256,
|
| 28 |
+
"subsampling_conv_kernel_size": 3,
|
| 29 |
+
"subsampling_conv_stride": 2,
|
| 30 |
+
"subsampling_factor": 8
|
|
|
|
|
|
|
| 31 |
},
|
| 32 |
+
"initializer_range": 0.02,
|
| 33 |
"model_type": "parakeet_ctc",
|
| 34 |
+
"pad_token_id": 1024,
|
| 35 |
+
"transformers_version": "4.57.0.dev0",
|
|
|
|
| 36 |
"vocab_size": 1025
|
| 37 |
}
|
model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:57e0bc26772f3360b7ae0c087f184364179906674d08fc8b71d48a54d4f52145
|
| 3 |
+
size 4250698604
|
preprocessor_config.json
CHANGED
|
@@ -1,20 +1,13 @@
|
|
| 1 |
{
|
| 2 |
-
"
|
| 3 |
-
"f_min": 0,
|
| 4 |
-
"feature_extractor_type": "FastConformerFeatureExtractor",
|
| 5 |
"feature_size": 80,
|
| 6 |
"hop_length": 160,
|
| 7 |
-
"mag_power": 2.0,
|
| 8 |
-
"mel_scale": "slaney",
|
| 9 |
"n_fft": 512,
|
| 10 |
-
"n_mels": 80,
|
| 11 |
-
"normalize": "per_feature",
|
| 12 |
"padding_side": "right",
|
| 13 |
"padding_value": 0.0,
|
| 14 |
"preemphasis": 0.97,
|
|
|
|
| 15 |
"return_attention_mask": true,
|
| 16 |
"sampling_rate": 16000,
|
| 17 |
-
"win_length": 400
|
| 18 |
-
"window_size": 0.025,
|
| 19 |
-
"window_stride": 0.01
|
| 20 |
}
|
|
|
|
| 1 |
{
|
| 2 |
+
"feature_extractor_type": "ParakeetFeatureExtractor",
|
|
|
|
|
|
|
| 3 |
"feature_size": 80,
|
| 4 |
"hop_length": 160,
|
|
|
|
|
|
|
| 5 |
"n_fft": 512,
|
|
|
|
|
|
|
| 6 |
"padding_side": "right",
|
| 7 |
"padding_value": 0.0,
|
| 8 |
"preemphasis": 0.97,
|
| 9 |
+
"processor_class": "ParakeetProcessor",
|
| 10 |
"return_attention_mask": true,
|
| 11 |
"sampling_rate": 16000,
|
| 12 |
+
"win_length": 400
|
|
|
|
|
|
|
| 13 |
}
|
special_tokens_map.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"pad_token": {
|
| 3 |
+
"content": "<pad>",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": false,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"unk_token": {
|
| 10 |
+
"content": "<unk>",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": false,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
}
|
| 16 |
+
}
|
tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tokenizer_config.json
CHANGED
|
@@ -7,13 +7,21 @@
|
|
| 7 |
"rstrip": false,
|
| 8 |
"single_word": false,
|
| 9 |
"special": true
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
}
|
| 11 |
},
|
| 12 |
-
"blank_token_id": 1024,
|
| 13 |
"clean_up_tokenization_spaces": false,
|
| 14 |
-
"do_lower_case": false,
|
| 15 |
"extra_special_tokens": {},
|
| 16 |
"model_max_length": 1000000000000000019884624838656,
|
| 17 |
-
"
|
|
|
|
|
|
|
| 18 |
"unk_token": "<unk>"
|
| 19 |
}
|
|
|
|
| 7 |
"rstrip": false,
|
| 8 |
"single_word": false,
|
| 9 |
"special": true
|
| 10 |
+
},
|
| 11 |
+
"1024": {
|
| 12 |
+
"content": "<pad>",
|
| 13 |
+
"lstrip": false,
|
| 14 |
+
"normalized": false,
|
| 15 |
+
"rstrip": false,
|
| 16 |
+
"single_word": false,
|
| 17 |
+
"special": true
|
| 18 |
}
|
| 19 |
},
|
|
|
|
| 20 |
"clean_up_tokenization_spaces": false,
|
|
|
|
| 21 |
"extra_special_tokens": {},
|
| 22 |
"model_max_length": 1000000000000000019884624838656,
|
| 23 |
+
"pad_token": "<pad>",
|
| 24 |
+
"processor_class": "ParakeetProcessor",
|
| 25 |
+
"tokenizer_class": "ParakeetTokenizerFast",
|
| 26 |
"unk_token": "<unk>"
|
| 27 |
}
|