trfms-integration (#3)
Browse files- trfs integration (31083d9ebf98f72055597970738f24a6644d90cd)
- readme update (1d5178a0d33d436744976db4448f73be16d0bfdf)
- readme update (e421a862718ad77e23463222b0644b8086fcf2ef)
- readme update (0090acf9edb289ef455a8c284af6325f4608b3f2)
- readme update (76d150f3f15da56e290b47660b4e9daaea0e83cb)
Co-authored-by: Eustache Le Bihan <[email protected]>
- README.md +76 -2
- config.json +37 -0
- model.safetensors +3 -0
- preprocessor_config.json +13 -0
- special_tokens_map.json +16 -0
- tokenizer.json +0 -0
- tokenizer_config.json +27 -0
    	
        README.md
    CHANGED
    
    | @@ -18,6 +18,7 @@ datasets: | |
| 18 | 
             
            - MLCommons/peoples_speech
         | 
| 19 | 
             
            thumbnail: null
         | 
| 20 | 
             
            tags:
         | 
|  | |
| 21 | 
             
            - automatic-speech-recognition
         | 
| 22 | 
             
            - speech
         | 
| 23 | 
             
            - audio
         | 
| @@ -191,7 +192,7 @@ pip install nemo_toolkit['all'] | |
| 191 |  | 
| 192 | 
             
            ## How to Use this Model
         | 
| 193 |  | 
| 194 | 
            -
            The model is available for use in the NeMo toolkit [3], and can be used as a pre-trained checkpoint for inference or for fine-tuning on another dataset.
         | 
| 195 |  | 
| 196 | 
             
            ### Automatically instantiate the model
         | 
| 197 |  | 
| @@ -200,7 +201,7 @@ import nemo.collections.asr as nemo_asr | |
| 200 | 
             
            asr_model = nemo_asr.models.EncDecCTCModelBPE.from_pretrained(model_name="nvidia/parakeet-ctc-0.6b")
         | 
| 201 | 
             
            ```
         | 
| 202 |  | 
| 203 | 
            -
            ### Transcribing using  | 
| 204 | 
             
            First, let's get a sample
         | 
| 205 | 
             
            ```
         | 
| 206 | 
             
            wget https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav
         | 
| @@ -210,6 +211,79 @@ Then simply do: | |
| 210 | 
             
            asr_model.transcribe(['2086-149220-0033.wav'])
         | 
| 211 | 
             
            ```
         | 
| 212 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 213 | 
             
            ### Transcribing many audio files
         | 
| 214 |  | 
| 215 | 
             
            ```shell
         | 
|  | |
| 18 | 
             
            - MLCommons/peoples_speech
         | 
| 19 | 
             
            thumbnail: null
         | 
| 20 | 
             
            tags:
         | 
| 21 | 
            +
            - transformers
         | 
| 22 | 
             
            - automatic-speech-recognition
         | 
| 23 | 
             
            - speech
         | 
| 24 | 
             
            - audio
         | 
|  | |
| 192 |  | 
| 193 | 
             
            ## How to Use this Model
         | 
| 194 |  | 
| 195 | 
            +
            The model is available for use in the NeMo toolkit [3], and can be used as a pre-trained checkpoint for inference or for fine-tuning on another dataset. Moreover, you can now run Parakeet CTC natively with [Transformers](https://github.com/huggingface/transformers) 🤗.
         | 
| 196 |  | 
| 197 | 
             
            ### Automatically instantiate the model
         | 
| 198 |  | 
|  | |
| 201 | 
             
            asr_model = nemo_asr.models.EncDecCTCModelBPE.from_pretrained(model_name="nvidia/parakeet-ctc-0.6b")
         | 
| 202 | 
             
            ```
         | 
| 203 |  | 
| 204 | 
            +
            ### Transcribing using NeMo
         | 
| 205 | 
             
            First, let's get a sample
         | 
| 206 | 
             
            ```
         | 
| 207 | 
             
            wget https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav
         | 
|  | |
| 211 | 
             
            asr_model.transcribe(['2086-149220-0033.wav'])
         | 
| 212 | 
             
            ```
         | 
| 213 |  | 
| 214 | 
            +
            ### Transcribing using [Transformers](https://github.com/huggingface/transformers) 🤗 
         | 
| 215 | 
            +
             | 
| 216 | 
            +
            Make sure to install `transformers` from source.
         | 
| 217 | 
            +
             | 
| 218 | 
            +
            ```bash
         | 
| 219 | 
            +
            pip install git+https://github.com/huggingface/transformers
         | 
| 220 | 
            +
            ```
         | 
| 221 | 
            +
             | 
| 222 | 
            +
            <details>
         | 
| 223 | 
            +
              <summary>➡️ Pipeline usage</summary>
         | 
| 224 | 
            +
             | 
| 225 | 
            +
            ```python
         | 
| 226 | 
            +
            from transformers import pipeline
         | 
| 227 | 
            +
             | 
| 228 | 
            +
            pipe = pipeline("automatic-speech-recognition", model="nvidia/parakeet-ctc-0.6b")
         | 
| 229 | 
            +
            out = pipe("https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/bcn_weather.mp3")
         | 
| 230 | 
            +
            print(out)
         | 
| 231 | 
            +
            ```
         | 
| 232 | 
            +
            </details>
         | 
| 233 | 
            +
             | 
| 234 | 
            +
            <details>
         | 
| 235 | 
            +
              <summary>➡️ AutoModel</summary>
         | 
| 236 | 
            +
             | 
| 237 | 
            +
            ```python
         | 
| 238 | 
            +
            from transformers import AutoModelForCTC, AutoProcessor
         | 
| 239 | 
            +
            from datasets import load_dataset, Audio
         | 
| 240 | 
            +
            import torch
         | 
| 241 | 
            +
             | 
| 242 | 
            +
            device = "cuda" if torch.cuda.is_available() else "cpu"
         | 
| 243 | 
            +
             | 
| 244 | 
            +
            processor = AutoProcessor.from_pretrained("nvidia/parakeet-ctc-0.6b")
         | 
| 245 | 
            +
            model = AutoModelForCTC.from_pretrained("nvidia/parakeet-ctc-0.6b", dtype="auto", device_map=device)
         | 
| 246 | 
            +
             | 
| 247 | 
            +
            ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         | 
| 248 | 
            +
            ds = ds.cast_column("audio", Audio(sampling_rate=processor.feature_extractor.sampling_rate))
         | 
| 249 | 
            +
            speech_samples = [el['array'] for el in ds["audio"][:5]]
         | 
| 250 | 
            +
             | 
| 251 | 
            +
            inputs = processor(speech_samples, sampling_rate=processor.feature_extractor.sampling_rate)
         | 
| 252 | 
            +
            inputs.to(model.device, dtype=model.dtype)
         | 
| 253 | 
            +
            outputs = model.generate(**inputs)
         | 
| 254 | 
            +
            print(processor.batch_decode(outputs))
         | 
| 255 | 
            +
            ```
         | 
| 256 | 
            +
            </details>
         | 
| 257 | 
            +
             | 
| 258 | 
            +
            <details>
         | 
| 259 | 
            +
              <summary>➡️ Training</summary>
         | 
| 260 | 
            +
             | 
| 261 | 
            +
            ```python
         | 
| 262 | 
            +
            from transformers import AutoModelForCTC, AutoProcessor
         | 
| 263 | 
            +
            from datasets import load_dataset, Audio
         | 
| 264 | 
            +
            import torch
         | 
| 265 | 
            +
             | 
| 266 | 
            +
            device = "cuda" if torch.cuda.is_available() else "cpu"
         | 
| 267 | 
            +
             | 
| 268 | 
            +
            processor = AutoProcessor.from_pretrained("nvidia/parakeet-ctc-0.6b")
         | 
| 269 | 
            +
            model = AutoModelForCTC.from_pretrained("nvidia/parakeet-ctc-0.6b", dtype="auto", device_map=device)
         | 
| 270 | 
            +
             | 
| 271 | 
            +
            ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         | 
| 272 | 
            +
            ds = ds.cast_column("audio", Audio(sampling_rate=processor.feature_extractor.sampling_rate))
         | 
| 273 | 
            +
            speech_samples = [el['array'] for el in ds["audio"][:5]]
         | 
| 274 | 
            +
            text_samples = [el for el in ds["text"][:5]]
         | 
| 275 | 
            +
             | 
| 276 | 
            +
            # passing `text` to the processor will prepare inputs' `labels` key
         | 
| 277 | 
            +
            inputs = processor(audio=speech_samples, text=text_samples, sampling_rate=processor.feature_extractor.sampling_rate)
         | 
| 278 | 
            +
            inputs.to(device, dtype=model.dtype)
         | 
| 279 | 
            +
             | 
| 280 | 
            +
            outputs = model(**inputs)
         | 
| 281 | 
            +
            outputs.loss.backward()
         | 
| 282 | 
            +
            ```
         | 
| 283 | 
            +
            </details>
         | 
| 284 | 
            +
             | 
| 285 | 
            +
            For more details about usage, refer to [Transformers' documentation](https://huggingface.co/docs/transformers/en/index).
         | 
| 286 | 
            +
             | 
| 287 | 
             
            ### Transcribing many audio files
         | 
| 288 |  | 
| 289 | 
             
            ```shell
         | 
    	
        config.json
    ADDED
    
    | @@ -0,0 +1,37 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "architectures": [
         | 
| 3 | 
            +
                "ParakeetForCTC"
         | 
| 4 | 
            +
              ],
         | 
| 5 | 
            +
              "ctc_loss_reduction": "mean",
         | 
| 6 | 
            +
              "ctc_zero_infinity": true,
         | 
| 7 | 
            +
              "dtype": "bfloat16",
         | 
| 8 | 
            +
              "encoder_config": {
         | 
| 9 | 
            +
                "activation_dropout": 0.1,
         | 
| 10 | 
            +
                "attention_bias": true,
         | 
| 11 | 
            +
                "attention_dropout": 0.1,
         | 
| 12 | 
            +
                "conv_kernel_size": 9,
         | 
| 13 | 
            +
                "dropout": 0.1,
         | 
| 14 | 
            +
                "dropout_positions": 0.0,
         | 
| 15 | 
            +
                "hidden_act": "silu",
         | 
| 16 | 
            +
                "hidden_size": 1024,
         | 
| 17 | 
            +
                "initializer_range": 0.02,
         | 
| 18 | 
            +
                "intermediate_size": 4096,
         | 
| 19 | 
            +
                "layerdrop": 0.1,
         | 
| 20 | 
            +
                "max_position_embeddings": 5000,
         | 
| 21 | 
            +
                "model_type": "parakeet_encoder",
         | 
| 22 | 
            +
                "num_attention_heads": 8,
         | 
| 23 | 
            +
                "num_hidden_layers": 24,
         | 
| 24 | 
            +
                "num_key_value_heads": 8,
         | 
| 25 | 
            +
                "num_mel_bins": 80,
         | 
| 26 | 
            +
                "scale_input": true,
         | 
| 27 | 
            +
                "subsampling_conv_channels": 256,
         | 
| 28 | 
            +
                "subsampling_conv_kernel_size": 3,
         | 
| 29 | 
            +
                "subsampling_conv_stride": 2,
         | 
| 30 | 
            +
                "subsampling_factor": 8
         | 
| 31 | 
            +
              },
         | 
| 32 | 
            +
              "initializer_range": 0.02,
         | 
| 33 | 
            +
              "model_type": "parakeet_ctc",
         | 
| 34 | 
            +
              "pad_token_id": 1024,
         | 
| 35 | 
            +
              "transformers_version": "4.57.0.dev0",
         | 
| 36 | 
            +
              "vocab_size": 1025
         | 
| 37 | 
            +
            }
         | 
    	
        model.safetensors
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:a1f5b1d2bc66d4a812a1c8ea48a9aa8678f863e1bd3ecbec3c75abd720897fda
         | 
| 3 | 
            +
            size 2435506188
         | 
    	
        preprocessor_config.json
    ADDED
    
    | @@ -0,0 +1,13 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "feature_extractor_type": "ParakeetFeatureExtractor",
         | 
| 3 | 
            +
              "feature_size": 80,
         | 
| 4 | 
            +
              "hop_length": 160,
         | 
| 5 | 
            +
              "n_fft": 512,
         | 
| 6 | 
            +
              "padding_side": "right",
         | 
| 7 | 
            +
              "padding_value": 0.0,
         | 
| 8 | 
            +
              "preemphasis": 0.97,
         | 
| 9 | 
            +
              "processor_class": "ParakeetProcessor",
         | 
| 10 | 
            +
              "return_attention_mask": true,
         | 
| 11 | 
            +
              "sampling_rate": 16000,
         | 
| 12 | 
            +
              "win_length": 400
         | 
| 13 | 
            +
            }
         | 
    	
        special_tokens_map.json
    ADDED
    
    | @@ -0,0 +1,16 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "pad_token": {
         | 
| 3 | 
            +
                "content": "<pad>",
         | 
| 4 | 
            +
                "lstrip": false,
         | 
| 5 | 
            +
                "normalized": false,
         | 
| 6 | 
            +
                "rstrip": false,
         | 
| 7 | 
            +
                "single_word": false
         | 
| 8 | 
            +
              },
         | 
| 9 | 
            +
              "unk_token": {
         | 
| 10 | 
            +
                "content": "<unk>",
         | 
| 11 | 
            +
                "lstrip": false,
         | 
| 12 | 
            +
                "normalized": false,
         | 
| 13 | 
            +
                "rstrip": false,
         | 
| 14 | 
            +
                "single_word": false
         | 
| 15 | 
            +
              }
         | 
| 16 | 
            +
            }
         | 
    	
        tokenizer.json
    ADDED
    
    | The diff for this file is too large to render. 
		See raw diff | 
|  | 
    	
        tokenizer_config.json
    ADDED
    
    | @@ -0,0 +1,27 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "added_tokens_decoder": {
         | 
| 3 | 
            +
                "0": {
         | 
| 4 | 
            +
                  "content": "<unk>",
         | 
| 5 | 
            +
                  "lstrip": false,
         | 
| 6 | 
            +
                  "normalized": false,
         | 
| 7 | 
            +
                  "rstrip": false,
         | 
| 8 | 
            +
                  "single_word": false,
         | 
| 9 | 
            +
                  "special": true
         | 
| 10 | 
            +
                },
         | 
| 11 | 
            +
                "1024": {
         | 
| 12 | 
            +
                  "content": "<pad>",
         | 
| 13 | 
            +
                  "lstrip": false,
         | 
| 14 | 
            +
                  "normalized": false,
         | 
| 15 | 
            +
                  "rstrip": false,
         | 
| 16 | 
            +
                  "single_word": false,
         | 
| 17 | 
            +
                  "special": true
         | 
| 18 | 
            +
                }
         | 
| 19 | 
            +
              },
         | 
| 20 | 
            +
              "clean_up_tokenization_spaces": false,
         | 
| 21 | 
            +
              "extra_special_tokens": {},
         | 
| 22 | 
            +
              "model_max_length": 1000000000000000019884624838656,
         | 
| 23 | 
            +
              "pad_token": "<pad>",
         | 
| 24 | 
            +
              "processor_class": "ParakeetProcessor",
         | 
| 25 | 
            +
              "tokenizer_class": "ParakeetTokenizerFast",
         | 
| 26 | 
            +
              "unk_token": "<unk>"
         | 
| 27 | 
            +
            }
         | 

 
		