Commit ce8b4b9 · Parent(s): 8cba305

Training in progress, step 1000
Files changed:
- .gitignore +1 -0
- .ipynb_checkpoints/fine-tune-whisper-streaming-checkpoint.ipynb +1205 -0
- added_tokens.json +109 -0
- config.json +42 -0
- fine-tune-whisper-streaming.ipynb +1287 -0
- merges.txt +0 -0
- normalizer.json +1742 -0
- preprocessor_config.json +0 -0
- pytorch_model.bin +3 -0
- runs/Dec14_14-23-12_132-145-140-45/1671027857.0917404/events.out.tfevents.1671027857.132-145-140-45.618344.1 +3 -0
- runs/Dec14_14-23-12_132-145-140-45/events.out.tfevents.1671027857.132-145-140-45.618344.0 +3 -0
- special_tokens_map.json +133 -0
- tokenizer_config.json +36 -0
- training_args.bin +3 -0
- vocab.json +0 -0
 
    	
.gitignore ADDED
@@ -0,0 +1 @@
+checkpoint-*/
.ipynb_checkpoints/fine-tune-whisper-streaming-checkpoint.ipynb ADDED
@@ -0,0 +1,1205 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "75b58048-7d14-4fc6-8085-1fc08c81b4a6",
+   "metadata": {},
+   "source": [
+    "# Fine-Tune Whisper With 🤗 Transformers and Streaming Mode"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "fbfa8ad5-4cdc-4512-9058-836cbbf65e1a",
+   "metadata": {},
+   "source": [
+    "In this Colab, we present a step-by-step guide on fine-tuning Whisper with Hugging Face 🤗 Transformers on 400 hours of speech data! Using streaming mode, we'll show how you can train a speech recognition model on any dataset, irrespective of size. With streaming mode, storage requirements are no longer a consideration: you can train a model on whatever dataset you want, even if its download size exceeds your device's disk space. How can this be possible? It simply seems too good to be true! Well, rest assured it's not 😉 Carry on reading to find out more."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "afe0d503-ae4e-4aa7-9af4-dbcba52db41e",
+   "metadata": {},
+   "source": [
+    "## Introduction"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9ae91ed4-9c3e-4ade-938e-f4c2dcfbfdc0",
+   "metadata": {},
+   "source": [
+    "Speech recognition datasets are large. A typical speech dataset consists of approximately 100 hours of audio-transcription data, requiring upwards of 130GB of storage space for download and preparation. For most ASR researchers, this is already at the upper limit of what is feasible for disk space. So what happens when we want to train on a larger dataset? The full [LibriSpeech](https://huggingface.co/datasets/librispeech_asr) dataset consists of 960 hours of audio data. Kensho's [SPGISpeech](https://huggingface.co/datasets/kensho/spgispeech) contains 5,000 hours of audio data. MLCommons' [People's Speech](https://huggingface.co/datasets/MLCommons/peoples_speech) contains **30,000+** hours of audio data! Do we need to bite the bullet and buy additional storage? Or is there a way we can train on all of these datasets with no disk drive requirements?\n",
+    "\n",
+    "When training machine learning systems, we rarely use the entire dataset at once. We typically _batch_ our data into smaller subsets of data, and pass these incrementally through our training pipeline. This is because we train our system on an accelerator device, such as a GPU or TPU, which has a memory limit typically around 16GB. We have to fit our model, optimiser and training data all on the same accelerator device, so we usually have to divide the dataset up into smaller batches and move them from the CPU to the GPU when required.\n",
+    "\n",
+    "Consequently, we don't require the entire dataset to be downloaded at once; we simply need the batch of data that we pass to our model at any one time. We can leverage this principle of partial dataset loading when preparing our dataset: rather than downloading the entire dataset at the start, we can load each piece of data as and when we need it. For each batch, we load the relevant data from a remote server and pass it through the training pipeline. For the next batch, we load the next items and again pass them through the training pipeline. At no point do we have to save data to our disk drive; we simply load each batch in memory and use it in our pipeline. In doing so, we only ever need as much memory as each individual batch requires.\n",
+    "\n",
+    "This is analogous to downloading a TV show versus streaming it 📺 When we download a TV show, we download the entire video offline and save it to our disk. Compare this to when we stream a TV show. Here, we don't save any part of the video to disk, but instead iterate over the video file, loading each part in real time as required. It's this same principle that we can apply to our ML training pipeline! We want to iterate over the dataset and load each sample of data as required.\n",
+    "\n",
+    "While the principle of partial dataset loading sounds ideal, it also seems **pretty** difficult to do. Luckily for us, 🤗 Datasets allows us to do this with minimal code changes! We'll make use of the principle of [_streaming_](https://huggingface.co/docs/datasets/stream), depicted graphically in Figure 1. Streaming does exactly this: the data is loaded progressively as we iterate over the dataset, meaning it is only loaded as and when we need it. If you're familiar with 🤗 Transformers and Datasets, the content of this notebook will be very familiar, with some small extensions to support streaming mode."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1c87f76e-47be-4a5d-bc52-7b1c2e9d4f5a",
+   "metadata": {},
+   "source": [
+    "<figure>\n",
+    "<img src=\"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/streaming.gif\" alt=\"Trulli\" style=\"width:100%\">\n",
+    "<figcaption align = \"center\"><b>Figure 1:</b> Streaming mode. The dataset is divided into smaller subsets, with subsets loaded progressively as we iterate over the dataset. </figcaption>\n",
+    "</figure>"
+   ]
+  },
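+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To make this concrete, here is a minimal sketch (added for illustration, not part of the original walkthrough) of streaming with 🤗 Datasets: passing `streaming=True` to `load_dataset` returns an iterable dataset that yields samples lazily, one at a time, as you iterate."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Illustrative sketch: stream a dataset and fetch a single example.\n",
+    "# Assumes network access to the Hugging Face Hub; nothing is saved to disk.\n",
+    "from datasets import load_dataset\n",
+    "\n",
+    "streamed = load_dataset(\"mozilla-foundation/common_voice_11_0\", \"es\", split=\"train\", streaming=True, use_auth_token=True)\n",
+    "sample = next(iter(streamed))  # downloads just this one example\n",
+    "print(sample[\"sentence\"])"
+   ]
+  },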
+  {
+   "cell_type": "markdown",
+   "id": "21b6316e-8a55-4549-a154-66d3da2ab74a",
+   "metadata": {},
+   "source": [
+    "This notebook provides a guide to fine-tuning on the task of _speech recognition_, which involves learning a\n",
+    "mapping from speech to text. Speech recognition is divided into two categories: English-only or multilingual (all other languages). \n",
+    "This notebook applies to both categories, with pointers for changing between languages and datasets.\n",
+    "\n",
+    "As for our model, we'll fine-tune the Whisper model released in [September 2022](https://openai.com/blog/whisper/) by the authors \n",
+    "Alec Radford et al. from OpenAI. Whisper is an encoder-decoder model pre-trained on 680k hours of labelled audio-transcription data. \n",
+    "It achieves strong performance on many speech recognition and speech translation datasets without fine-tuning. With fine-tuning, \n",
+    "we aim to improve upon these results further, with many SoTA results up for grabs! For a full explanation of the Whisper model, the \n",
+    "reader is advised to read the blog post [Fine-Tune Whisper with 🤗 Transformers](https://huggingface.co/blog/fine-tune-whisper#introduction).\n",
+    "\n",
+    "The Whisper checkpoints come in five configurations of varying model sizes.\n",
+    "The smallest four are trained on either English-only or multilingual data.\n",
+    "The largest checkpoint is multilingual only. All nine of the pre-trained checkpoints \n",
+    "are available on the [Hugging Face Hub](https://huggingface.co/models?search=openai/whisper). The \n",
+    "checkpoints are summarised in the following table with links to the models on the Hub:\n",
+    "\n",
+    "| Size   | Layers | Width | Heads | Parameters | English-only                                         | Multilingual                                      |\n",
+    "|--------|--------|-------|-------|------------|------------------------------------------------------|---------------------------------------------------|\n",
+    "| tiny   | 4      | 384   | 6     | 39 M       | [✓](https://huggingface.co/openai/whisper-tiny.en)   | [✓](https://huggingface.co/openai/whisper-tiny)   |\n",
+    "| base   | 6      | 512   | 8     | 74 M       | [✓](https://huggingface.co/openai/whisper-base.en)   | [✓](https://huggingface.co/openai/whisper-base)   |\n",
+    "| small  | 12     | 768   | 12    | 244 M      | [✓](https://huggingface.co/openai/whisper-small.en)  | [✓](https://huggingface.co/openai/whisper-small)  |\n",
+    "| medium | 24     | 1024  | 16    | 769 M      | [✓](https://huggingface.co/openai/whisper-medium.en) | [✓](https://huggingface.co/openai/whisper-medium) |\n",
+    "| large  | 32     | 1280  | 20    | 1550 M     | x                                                    | [✓](https://huggingface.co/openai/whisper-large)  |\n",
+    "\n",
+    "When fine-tuning on an English dataset for speech recognition, it is recommended to select one of the English-only checkpoints. For any other language, it is recommended to select a multilingual checkpoint.\n",
+    "\n",
+    "For demonstration purposes, we'll fine-tune the multilingual version of the \n",
+    "[`\"small\"`](https://huggingface.co/openai/whisper-small) checkpoint with 244M params (~= 1GB). \n",
+    "As for our data, we'll train and evaluate our system on 400 hours of multilingual speech recognition data\n",
+    "taken from the [Common Voice 11](https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0)\n",
+    "dataset. We'll show how we can train a model on 400 hours of training data using the default disk space \n",
+    "that comes with a standard GPU device or Google Colab."
+   ]
+  },
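+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As a quick orientation (this cell is an illustrative sketch, not part of the original walkthrough), loading the multilingual `small` checkpoint and its processor takes two lines:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sketch: load the multilingual `small` checkpoint (~1GB download on first use)\n",
+    "from transformers import WhisperForConditionalGeneration, WhisperProcessor\n",
+    "\n",
+    "processor = WhisperProcessor.from_pretrained(\"openai/whisper-small\", language=\"Spanish\", task=\"transcribe\")\n",
+    "model = WhisperForConditionalGeneration.from_pretrained(\"openai/whisper-small\")"
+   ]
+  },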
+  {
+   "cell_type": "markdown",
+   "id": "b219c9dd-39b6-4a95-b2a1-3f547a1e7bc0",
+   "metadata": {},
+   "source": [
+    "## Load Dataset with Streaming"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b17a4763-4381-4157-ae38-b04a8b5f1c43",
+   "metadata": {},
+   "source": [
+    "This is where the magic happens! We'll first write a wrapper function around 🤗 Datasets' `load_dataset` method. This function downloads the required splits using streaming mode by forcing `streaming=True` in the `load_dataset` method. Multiple splits can be combined (interleaved) by concatenating them with the \"+\" symbol when specifying the split name, e.g. `split=\"train+validation\"` will return a single split with the training and validation splits interleaved together. The function has the same arguments and keyword arguments as 🤗 Datasets' `load_dataset` method, so we can use it in exactly the same way!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "065a8cf7-e54f-4ac3-900e-609c80714fca",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datasets import interleave_datasets, load_dataset\n",
+    "\n",
+    "def load_streaming_dataset(dataset_name, dataset_config_name, split, **kwargs):\n",
+    "    if \"+\" in split:\n",
+    "        # load multiple splits separated by the `+` symbol *with* streaming mode\n",
+    "        dataset_splits = [load_dataset(dataset_name, dataset_config_name, split=split_name, streaming=True, **kwargs) for split_name in split.split(\"+\")]\n",
+    "        # interleave multiple splits to form one dataset\n",
+    "        interleaved_dataset = interleave_datasets(dataset_splits)\n",
+    "        return interleaved_dataset\n",
+    "    else:\n",
+    "        # load a single split *with* streaming mode\n",
+    "        dataset = load_dataset(dataset_name, dataset_config_name, split=split, streaming=True, **kwargs)\n",
+    "        return dataset"
+   ]
+  },
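+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For example (a hypothetical call added for illustration), interleaving the train and validation splits of Common Voice Spanish would look like:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Illustrative only: combine two splits into a single streamed dataset\n",
+    "combined = load_streaming_dataset(\"mozilla-foundation/common_voice_11_0\", \"es\", split=\"train+validation\", use_auth_token=True)"
+   ]
+  },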
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "ed0df0dc-8c2a-47c9-b105-49d61aec9890",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datasets import Audio, interleave_datasets, IterableDataset, load_dataset\n",
+    "from typing import List, Optional"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "fa07e8c0-1874-43e7-8eec-fac124d0cdfe",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dataset_names = [\"mozilla-foundation/common_voice_11_0\", \"google/fleurs\"]\n",
+    "dataset_config_names = [\"es\", \"es_419\"]\n",
+    "text_column_names = [\"sentence\", \"raw_transcription\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "88a7949b-60e2-4269-94da-e18d24dc3788",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def load_multiple_streaming_datasets(\n",
+    "    dataset_names: List,\n",
+    "    dataset_config_names: List,\n",
+    "    splits: Optional[List] = None,\n",
+    "    text_column_names: Optional[List] = None,\n",
+    "    sampling_rate: Optional[int] = 16000,\n",
+    "    stopping_strategy: Optional[str] = \"all_exhausted\",\n",
+    "    **kwargs\n",
+    ") -> IterableDataset:\n",
+    "\n",
+    "    if len(dataset_names) != len(dataset_config_names):\n",
+    "        raise ValueError(\n",
+    "            f\"Ensure one config is passed for each dataset, got {len(dataset_names)} datasets and\"\n",
+    "            f\" {len(dataset_config_names)} configs.\"\n",
+    "        )\n",
+    "\n",
+    "    if splits is not None and len(splits) != len(dataset_names):\n",
+    "        raise ValueError(\n",
+    "            f\"Ensure one split is passed for each dataset, got {len(dataset_names)} datasets and {len(splits)} splits.\"\n",
+    "        )\n",
+    "\n",
+    "    if text_column_names is not None and len(text_column_names) != len(dataset_names):\n",
+    "        raise ValueError(\n",
+    "            f\"Ensure one text column name is passed for each dataset, got {len(dataset_names)} datasets and\"\n",
+    "            f\" {len(text_column_names)} text column names.\"\n",
+    "        )\n",
+    "\n",
+    "    splits = splits if splits is not None else [\"train\" for i in range(len(dataset_names))]\n",
+    "    text_column_names = (\n",
+    "        text_column_names if text_column_names is not None else [\"text\" for i in range(len(dataset_names))]\n",
+    "    )\n",
+    "\n",
+    "    all_datasets = []\n",
+    "    # iterate over the datasets we want to interleave\n",
+    "    for i, dataset_name in enumerate(dataset_names):\n",
+    "        dataset = load_dataset(dataset_name, dataset_config_names[i], split=splits[i], streaming=True, **kwargs)\n",
+    "        # resample to specified sampling rate\n",
+    "        dataset = dataset.cast_column(\"audio\", Audio(sampling_rate))\n",
+    "        # normalise columns to [\"audio\", \"sentence\"]\n",
+    "        if text_column_names[i] != \"sentence\":\n",
+    "            dataset = dataset.rename_column(text_column_names[i], \"sentence\")\n",
+    "        dataset = dataset.remove_columns(set(dataset.features.keys()) - set([\"audio\", \"sentence\"]))\n",
+    "        all_datasets.append(dataset)\n",
+    "\n",
+    "    interleaved_dataset = interleave_datasets(all_datasets, stopping_strategy=stopping_strategy)\n",
+    "    return interleaved_dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "1f3e756f-f55f-4077-951f-6d04930bf5d2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_ds = load_multiple_streaming_datasets(dataset_names, dataset_config_names=dataset_config_names, text_column_names=text_column_names, use_auth_token=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "0155ff1a-8a3e-406b-8700-dcaafd9535cf",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Reading metadata...: 230467it [00:04, 56208.39it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0 ¿ Qué tal a tres de cinco ?\n",
+      "1 El uso de Internet y de la red informática mundial permite que los estudiantes tengan acceso a la información en todo momento.\n",
+      "2 vamos , quiero decir , que no soy de citas especiales .\n",
+      "3 Los deportes de nieve en descenso, como el esquí y la tablanieve, son disciplinas populares que consisten en deslizarse con esquís o una tabla fijada a los pies, sobre un terreno nevado.\n",
+      "4 fray Lope , en aquel momento , colmaba otro vaso igual :\n",
+      "5 El título de «capital de la moda» fue traspasado, a partir de entonces, de Constantinopla a París.\n",
+      "6 hermanito . dice hermanito . anda ...\n",
+      "7 Diez años después, estuvo al mando de la participación soviética en la misión Apollo-Soyuz, que simbolizaba el fin de la Carrera Espacial.\n",
+      "8 tengo un mensaje para usted\n",
+      "9 Haga averiguaciones en la institución y en la oficina de inmigración del destino donde pretende estudiar, a fin de tomar conocimiento en detalle los requisitos.\n"
+     ]
+    }
+   ],
+   "source": [
+    "for i, sample in enumerate(train_ds):\n",
+    "    print(i, sample[\"sentence\"])\n",
+    "    if i == 9:\n",
+    "        break"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "d8be8403-334f-4485-aff0-55f2a3cc3680",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None),\n",
+       " 'sentence': Value(dtype='string', id=None)}"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "train_ds.features"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "674429c5-0ab4-4adf-975b-621bb69eca38",
+   "metadata": {},
+   "source": [
+    "We'll train our system on the Spanish split of [Common Voice 11](https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0). We can see how much training data we have by viewing the [language page](https://commonvoice.mozilla.org/en/datasets) on the Common Voice website. The Spanish split has over 400 hours of labelled training data - that's enormous! More than we could ever fit on a Google Colab or a standard workstation. But with streaming mode, we'll only download data as and when we need it, making training on this dataset possible!\n",
+    "\n",
+    "Since Spanish is relatively high-resource, we'll only use the `train` split for training and the `test` split for evaluation. If you're training on a low-resource language, such as the Hindi split of Common Voice 11, it's worth combining the `train` and `validation` splits to give a larger training set. You can achieve this by setting: `split=\"train+validation\"` for the training split.\n",
+    "\n",
+    "If you're using a gated dataset, like Common Voice 11, ensure you have accepted the terms of use on the Hugging Face Hub: [mozilla-foundation/common_voice_11_0](https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0). Once you have accepted the terms, you will have full access to the dataset and be able to load the data locally."
+   ]
+  },
| 290 | 
         
            +
              {
         
     | 
| 291 | 
         
            +
               "cell_type": "code",
         
     | 
| 292 | 
         
            +
               "execution_count": 14,
         
     | 
| 293 | 
         
            +
               "id": "a2787582-554f-44ce-9f38-4180a5ed6b44",
         
     | 
| 294 | 
         
            +
               "metadata": {},
         
     | 
| 295 | 
         
            +
               "outputs": [],
         
     | 
| 296 | 
         
            +
               "source": [
         
     | 
| 297 | 
         
            +
                "from datasets import IterableDatasetDict\n",
         
     | 
| 298 | 
         
            +
                "\n",
         
     | 
| 299 | 
         
            +
                "raw_datasets = IterableDatasetDict()\n",
         
     | 
| 300 | 
         
            +
                "\n",
         
     | 
| 301 | 
         
            +
                "raw_datasets[\"train\"] = traind_ds  # set split=\"train+validation\" for low-resource\n",
         
     | 
| 302 | 
         
            +
                "raw_datasets[\"test\"] = load_streaming_dataset(\"mozilla-foundation/common_voice_11_0\", \"es\", split=\"test\", use_auth_token=True)\n",
         
     | 
| 303 | 
         
            +
                "# raw_datasets[\"train\"] = traind_ds"
         
     | 
| 304 | 
         
            +
               ]
         
     | 
| 305 | 
         
            +
              },
         
     | 
| 306 | 
         
            +
              {
         
     | 
| 307 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 308 | 
         
            +
               "id": "2d63b2d2-f68a-4d74-b7f1-5127f6d16605",
         
     | 
| 309 | 
         
            +
               "metadata": {},
         
     | 
| 310 | 
         
            +
               "source": [
         
     | 
| 311 | 
         
            +
                "## Prepare Processor and Pre-Process Data"
         
     | 
| 312 | 
         
            +
               ]
         
     | 
| 313 | 
         
            +
              },
         
     | 
| 314 | 
         
            +
              {
         
     | 
| 315 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 316 | 
         
            +
               "id": "601c3099-1026-439e-93e2-5635b3ba5a73",
         
     | 
| 317 | 
         
            +
               "metadata": {},
         
     | 
| 318 | 
         
            +
               "source": [
         
     | 
| 319 | 
         
            +
                "The ASR pipeline can be de-composed into three stages: \n",
         
     | 
| 320 | 
         
            +
                "1) A feature extractor which pre-processes the raw audio-inputs\n",
         
     | 
| 321 | 
         
            +
                "2) The model which performs the sequence-to-sequence mapping \n",
         
     | 
| 322 | 
         
            +
                "3) A tokenizer which post-processes the model outputs to text format\n",
         
     | 
| 323 | 
         
            +
                "\n",
         
     | 
| 324 | 
         
            +
                "In 🤗 Transformers, the Whisper model has an associated feature extractor and tokenizer, \n",
         
     | 
| 325 | 
         
            +
                "called [WhisperFeatureExtractor](https://huggingface.co/docs/transformers/main/model_doc/whisper#transformers.WhisperFeatureExtractor)\n",
         
     | 
| 326 | 
         
            +
                "and [WhisperTokenizer](https://huggingface.co/docs/transformers/main/model_doc/whisper#transformers.WhisperTokenizer) \n",
         
     | 
| 327 | 
         
            +
                "respectively. To make our lives simple, these two objects are wrapped under a single class, called the [WhisperProcessor](https://huggingface.co/docs/transformers/model_doc/whisper#transformers.WhisperProcessor). We can call the WhisperProcessor to perform \n",
         
     | 
| 328 | 
         
            +
                "both the audio pre-processing and the text token post-processing. In doing so, we only need to keep track of two objects during training: \n",
         
     | 
| 329 | 
         
            +
                "the `processor` and the `model`.\n",
         
     | 
| 330 | 
         
            +
                "\n",
         
     | 
| 331 | 
         
            +
                "If using a multilingual checkpoint, you should set the `\"language\"` to your target text language. You should also set the task to `\"transcribe\"` for speech recogntition and `\"translate\"` for speech translation. These arguments modify the behaviour of the tokenizer - they should be set correctly to ensure the target labels are encoded properly. These arguments should be omitted for English-only fine-tuning."
         
     | 
| 332 | 
         
            +
               ]
         
     | 
| 333 | 
         
            +
              },
         
     | 
| 334 | 
         
            +
              {
         
     | 
| 335 | 
         
            +
               "cell_type": "code",
         
     | 
| 336 | 
         
            +
               "execution_count": 15,
         
     | 
| 337 | 
         
            +
               "id": "77d9f0c5-8607-4642-a8ac-c3ab2e223ea6",
         
     | 
| 338 | 
         
            +
               "metadata": {},
         
     | 
| 339 | 
         
            +
               "outputs": [],
         
     | 
| 340 | 
         
            +
               "source": [
         
     | 
| 341 | 
         
            +
                "from transformers import WhisperProcessor\n",
         
     | 
| 342 | 
         
            +
                "\n",
         
     | 
| 343 | 
         
            +
                "processor = WhisperProcessor.from_pretrained(\"juancopi81/whisper-medium-es\", language=\"Spanish\", task=\"transcribe\")"
         
     | 
| 344 | 
         
            +
               ]
         
     | 
| 345 | 
         
            +
              },
         
     | 
| 346 | 
         
            +
              {
         
     | 
| 347 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 348 | 
         
            +
               "id": "381acd09-0b0f-4d04-9eb3-f028ac0e5f2c",
         
     | 
| 349 | 
         
            +
               "metadata": {},
         
     | 
| 350 | 
         
            +
               "source": [
         
     | 
| 351 | 
         
            +
                "### Pre-Process Data"
         
     | 
| 352 | 
         
            +
               ]
         
     | 
| 353 | 
         
            +
              },
         
     | 
| 354 | 
         
            +
              {
         
     | 
| 355 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 356 | 
         
            +
               "id": "bf10cd3e-924e-44fc-8790-46e413de7b3d",
         
     | 
| 357 | 
         
            +
               "metadata": {},
         
     | 
| 358 | 
         
            +
               "source": [
         
     | 
| 359 | 
         
            +
                "Let's have a look at the dataset features. Pay particular attention to the `\"audio\"` column - this details the sampling rate of our audio inputs:"
         
     | 
| 360 | 
         
            +
               ]
         
     | 
| 361 | 
         
            +
              },
         
     | 
| 362 | 
         
            +
              {
         
     | 
| 363 | 
         
            +
               "cell_type": "code",
         
     | 
| 364 | 
         
            +
               "execution_count": 16,
         
     | 
| 365 | 
         
            +
               "id": "ab5a13b4-9bd4-4aa0-aef2-b3de9b762988",
         
     | 
| 366 | 
         
            +
               "metadata": {},
         
     | 
| 367 | 
         
            +
               "outputs": [
         
     | 
| 368 | 
         
            +
                {
         
     | 
| 369 | 
         
            +
                 "data": {
         
     | 
| 370 | 
         
            +
                  "text/plain": [
         
     | 
| 371 | 
         
            +
                   "{'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None),\n",
         
     | 
| 372 | 
         
            +
                   " 'sentence': Value(dtype='string', id=None)}"
         
     | 
| 373 | 
         
            +
                  ]
         
     | 
| 374 | 
         
            +
                 },
         
     | 
| 375 | 
         
            +
                 "execution_count": 16,
         
     | 
| 376 | 
         
            +
                 "metadata": {},
         
     | 
| 377 | 
         
            +
                 "output_type": "execute_result"
         
     | 
| 378 | 
         
            +
                }
         
     | 
| 379 | 
         
            +
               ],
         
     | 
| 380 | 
         
            +
               "source": [
         
     | 
| 381 | 
         
            +
                "raw_datasets[\"train\"].features"
         
     | 
| 382 | 
         
            +
               ]
         
     | 
| 383 | 
         
            +
              },
         
     | 
| 384 | 
         
            +
              {
         
     | 
| 385 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 386 | 
         
            +
               "id": "5a679f05-063d-41b3-9b58-4fc9c6ccf4fd",
         
     | 
| 387 | 
         
            +
               "metadata": {},
         
     | 
| 388 | 
         
            +
               "source": [
         
     | 
| 389 | 
         
            +
                "Since our input audio is sampled at 48kHz, we need to _downsample_ it to\n",
         
     | 
| 390 | 
         
            +
                "16kHz prior to passing it to the Whisper feature extractor, 16kHz being the sampling rate expected by the Whisper model. \n",
         
     | 
| 391 | 
         
            +
                "\n",
         
     | 
| 392 | 
         
            +
                "We'll set the audio inputs to the correct sampling rate using dataset's \n",
         
     | 
| 393 | 
         
            +
                "[`cast_column`](https://huggingface.co/docs/datasets/package_reference/main_classes.html?highlight=cast_column#datasets.DatasetDict.cast_column)\n",
         
     | 
| 394 | 
         
            +
                "method. This operation does not change the audio in-place, \n",
         
     | 
| 395 | 
         
            +
                "but rather signals to `datasets` to resample audio samples _on the fly_ the \n",
         
     | 
| 396 | 
         
            +
                "first time that they are loaded:"
         
     | 
| 397 | 
         
            +
               ]
         
     | 
| 398 | 
         
            +
              },
         
     | 
| 399 | 
         
            +
              {
         
     | 
| 400 | 
         
            +
               "cell_type": "code",
         
     | 
| 401 | 
         
            +
               "execution_count": 17,
         
     | 
| 402 | 
         
            +
               "id": "3ab6a724-3d1e-478b-a9e9-d2f85feb6c39",
         
     | 
| 403 | 
         
            +
               "metadata": {},
         
     | 
| 404 | 
         
            +
               "outputs": [],
         
     | 
| 405 | 
         
            +
               "source": [
         
     | 
| 406 | 
         
            +
                "from datasets import Audio\n",
         
     | 
| 407 | 
         
            +
                "\n",
         
     | 
| 408 | 
         
            +
                "raw_datasets = raw_datasets.cast_column(\"audio\", Audio(sampling_rate=16000))"
         
     | 
| 409 | 
         
            +
               ]
         
     | 
| 410 | 
         
            +
              },
         
     | 
| 411 | 
         
            +
              {
         
     | 
| 412 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 413 | 
         
            +
               "id": "161322c2-94f3-4d26-9e1d-d9d5202ca3cf",
         
     | 
| 414 | 
         
            +
               "metadata": {},
         
     | 
| 415 | 
         
            +
               "source": [
         
     | 
| 416 | 
         
            +
                "We'll define our pre-processing strategy. We advise that you **do not** lower-case the transcriptions or remove punctuation unless mixing different datasets. This will enable you to fine-tune Whisper models that can predict punctuation and casing. Later, you will see how we can evaluate the predictions without punctuation or casing, so that the models benefit from the WER improvement obtained by normalising the transcriptions while still predicting fully formatted transcriptions."
         
     | 
| 417 | 
         
            +
               ]
         
     | 
| 418 | 
         
            +
              },
         
     | 
| 419 | 
         
            +
              {
         
     | 
| 420 | 
         
            +
               "cell_type": "code",
         
     | 
| 421 | 
         
            +
               "execution_count": 18,
         
     | 
| 422 | 
         
            +
               "id": "d041650e-1c48-4439-87b3-5b6f4a514107",
         
     | 
| 423 | 
         
            +
               "metadata": {},
         
     | 
| 424 | 
         
            +
               "outputs": [],
         
     | 
| 425 | 
         
            +
               "source": [
         
     | 
| 426 | 
         
            +
                "from transformers.models.whisper.english_normalizer import BasicTextNormalizer\n",
         
     | 
| 427 | 
         
            +
                "\n",
         
     | 
| 428 | 
         
            +
                "do_lower_case = False\n",
         
     | 
| 429 | 
         
            +
                "do_remove_punctuation = False\n",
         
     | 
| 430 | 
         
            +
                "\n",
         
     | 
| 431 | 
         
            +
                "normalizer = BasicTextNormalizer()"
         
     | 
| 432 | 
         
            +
               ]
         
     | 
| 433 | 
         
            +
              },
         
     | 
| 434 | 
         
            +
              {
         
     | 
| 435 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 436 | 
         
            +
               "id": "bfaa935b-a11d-497c-88c1-0c4d1bb3247b",
         
     | 
| 437 | 
         
            +
               "metadata": {},
         
     | 
| 438 | 
         
            +
               "source": [
         
     | 
| 439 | 
         
            +
                "Now we can write a function to prepare our data ready for the model:\n",
         
     | 
| 440 | 
         
            +
                "1. We load and resample the audio data by calling `batch[\"audio\"]`. As explained above, 🤗 Datasets performs any necessary resampling operations on the fly.\n",
         
     | 
| 441 | 
         
            +
                "2. We use the feature extractor to compute the log-Mel spectrogram input features from our 1-dimensional audio array.\n",
         
     | 
| 442 | 
         
            +
                "3. We perform any optional pre-processing (lower-case or remove punctuation).\n",
         
     | 
| 443 | 
         
            +
                "4. We encode the transcriptions to label ids through the use of the tokenizer."
         
     | 
| 444 | 
         
            +
               ]
         
     | 
| 445 | 
         
            +
              },
         
     | 
| 446 | 
         
            +
              {
         
     | 
| 447 | 
         
            +
               "cell_type": "code",
         
     | 
| 448 | 
         
            +
               "execution_count": 19,
         
     | 
| 449 | 
         
            +
               "id": "c085911c-a10a-41ef-8874-306e0503e9bb",
         
     | 
| 450 | 
         
            +
               "metadata": {},
         
     | 
| 451 | 
         
            +
               "outputs": [],
         
     | 
| 452 | 
         
            +
               "source": [
         
     | 
| 453 | 
         
            +
                "def prepare_dataset(batch):\n",
         
     | 
| 454 | 
         
            +
                "    # load and (possibly) resample audio data to 16kHz\n",
         
     | 
| 455 | 
         
            +
                "    audio = batch[\"audio\"]\n",
         
     | 
| 456 | 
         
            +
                "\n",
         
     | 
| 457 | 
         
            +
                "    # compute log-Mel input features from input audio array \n",
         
     | 
| 458 | 
         
            +
                "    batch[\"input_features\"] = processor.feature_extractor(audio[\"array\"], sampling_rate=audio[\"sampling_rate\"]).input_features[0]\n",
         
     | 
| 459 | 
         
            +
                "    # compute input length of audio sample in seconds\n",
         
     | 
| 460 | 
         
            +
                "    batch[\"input_length\"] = len(audio[\"array\"]) / audio[\"sampling_rate\"]\n",
         
     | 
| 461 | 
         
            +
                "    \n",
         
     | 
| 462 | 
         
            +
                "    # optional pre-processing steps\n",
         
     | 
| 463 | 
         
            +
                "    transcription = batch[\"sentence\"]\n",
         
     | 
| 464 | 
         
            +
                "    if do_lower_case:\n",
         
     | 
| 465 | 
         
            +
                "        transcription = transcription.lower()\n",
         
     | 
| 466 | 
         
            +
                "    if do_remove_punctuation:\n",
         
     | 
| 467 | 
         
            +
                "        transcription = normalizer(transcription).strip()\n",
         
     | 
| 468 | 
         
            +
                "    \n",
         
     | 
| 469 | 
         
            +
                "    # encode target text to label ids\n",
         
     | 
| 470 | 
         
            +
                "    batch[\"labels\"] = processor.tokenizer(transcription).input_ids\n",
         
     | 
| 471 | 
         
            +
                "    return batch"
         
     | 
| 472 | 
         
            +
               ]
         
     | 
| 473 | 
         
            +
              },
         
     | 
| 474 | 
         
            +
              {
         
     | 
| 475 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 476 | 
         
            +
               "id": "70b319fb-2439-4ef6-a70d-a47bf41c4a13",
         
     | 
| 477 | 
         
            +
               "metadata": {},
         
     | 
| 478 | 
         
            +
               "source": [
         
     | 
| 479 | 
         
            +
                "We can apply the data preparation function to all of our training examples using 🤗 Datasets' `.map` method. We'll remove all of the columns from the raw training data, leaving just the `input_features` and `labels` defined in the `prepare_dataset` function:"
         
     | 
| 480 | 
         
            +
               ]
         
     | 
| 481 | 
         
            +
              },
         
     | 
| 482 | 
         
            +
              {
         
     | 
| 483 | 
         
            +
               "cell_type": "code",
         
     | 
| 484 | 
         
            +
               "execution_count": 25,
         
     | 
| 485 | 
         
            +
               "id": "a37a7cdb-9013-427f-8de9-6a8d0e9dc684",
         
     | 
| 486 | 
         
            +
               "metadata": {},
         
     | 
| 487 | 
         
            +
               "outputs": [],
         
     | 
| 488 | 
         
            +
               "source": [
         
     | 
| 489 | 
         
            +
                "vectorized_datasets = raw_datasets.map(prepare_dataset, remove_columns=list(next(iter(raw_datasets.values())).features)).with_format(\"torch\")"
         
     | 
| 490 | 
         
            +
               ]
         
     | 
| 491 | 
         
            +
              },
         
     | 
| 492 | 
         
            +
              {
         
     | 
| 493 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 494 | 
         
            +
               "id": "3d59b37e-4950-47ec-9e3e-2cf2ec7fc750",
         
     | 
| 495 | 
         
            +
               "metadata": {},
         
     | 
| 496 | 
         
            +
               "source": [
         
     | 
| 497 | 
         
            +
                "We can now define how we shuffle the data in the train split. The size of the subset we load is set by the variable `buffer_size`. You can increase or decrease this depending on your memory constraints. In this example, the `buffer_size` is set to 500, meaning 500 samples are loaded before shuffling across the subset. The larger we set this value, the closer to True offline shuffling. The `seed` is set for reproducibility:"
         
     | 
| 498 | 
         
            +
               ]
         
     | 
| 499 | 
         
            +
              },
         
     | 
| 500 | 
         
            +
              {
         
     | 
| 501 | 
         
            +
               "cell_type": "code",
         
     | 
| 502 | 
         
            +
               "execution_count": 26,
         
     | 
| 503 | 
         
            +
               "id": "1b145699-acfc-4b1d-93a2-a2ad3d62674c",
         
     | 
| 504 | 
         
            +
               "metadata": {},
         
     | 
| 505 | 
         
            +
               "outputs": [],
         
     | 
| 506 | 
         
            +
               "source": [
         
     | 
| 507 | 
         
            +
                "vectorized_datasets[\"train\"] = vectorized_datasets[\"train\"].shuffle(\n",
         
     | 
| 508 | 
         
            +
                "    buffer_size=500,\n",
         
     | 
| 509 | 
         
            +
                "    seed=0,\n",
         
     | 
| 510 | 
         
            +
                ")"
         
     | 
| 511 | 
         
            +
               ]
         
     | 
| 512 | 
         
            +
              },
         
     | 
| 513 | 
         
            +
              {
         
     | 
| 514 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 515 | 
         
            +
               "id": "666b9ef0-7909-4e1e-a419-87604d233e29",
         
     | 
| 516 | 
         
            +
               "metadata": {},
         
     | 
| 517 | 
         
            +
               "source": [
         
     | 
| 518 | 
         
            +
                "Finally, we filter any training data with audio samples longer than 30s. These samples would otherwise be truncated by the Whisper feature-extractor which could affect the stability of training. We define a function that returns `True` for samples that are less than 30s, and `False` for those that are longer:"
         
     | 
| 519 | 
         
            +
               ]
         
     | 
| 520 | 
         
            +
              },
         
     | 
| 521 | 
         
            +
              {
         
     | 
| 522 | 
         
            +
               "cell_type": "code",
         
     | 
| 523 | 
         
            +
               "execution_count": 27,
         
     | 
| 524 | 
         
            +
               "id": "01cb25ef-4bb0-4325-9461-f59198acadf6",
         
     | 
| 525 | 
         
            +
               "metadata": {},
         
     | 
| 526 | 
         
            +
               "outputs": [],
         
     | 
| 527 | 
         
            +
               "source": [
         
     | 
| 528 | 
         
            +
                "max_input_length = 30.0\n",
         
     | 
| 529 | 
         
            +
                "\n",
         
     | 
| 530 | 
         
            +
                "def is_audio_in_length_range(length):\n",
         
     | 
| 531 | 
         
            +
                "    return length < max_input_length"
         
     | 
| 532 | 
         
            +
               ]
         
     | 
| 533 | 
         
            +
              },
         
     | 
| 534 | 
         
            +
              {
         
     | 
| 535 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 536 | 
         
            +
               "id": "28e37ac3-b1c5-465b-8586-7cfd8d76b0f1",
         
     | 
| 537 | 
         
            +
               "metadata": {},
         
     | 
| 538 | 
         
            +
               "source": [
         
     | 
| 539 | 
         
            +
                "We apply our filter function to all samples of our training dataset through 🤗 Datasets' `.filter` method:"
         
     | 
| 540 | 
         
            +
               ]
         
     | 
| 541 | 
         
            +
              },
         
     | 
| 542 | 
         
            +
              {
         
     | 
| 543 | 
         
            +
               "cell_type": "code",
         
     | 
| 544 | 
         
            +
               "execution_count": 28,
         
     | 
| 545 | 
         
            +
               "id": "333f7f6e-6053-4d3b-8924-c733c79b82ac",
         
     | 
| 546 | 
         
            +
               "metadata": {},
         
     | 
| 547 | 
         
            +
               "outputs": [],
         
     | 
| 548 | 
         
            +
               "source": [
         
     | 
| 549 | 
         
            +
                "vectorized_datasets[\"train\"] = vectorized_datasets[\"train\"].filter(\n",
         
     | 
| 550 | 
         
            +
                "    is_audio_in_length_range,\n",
         
     | 
| 551 | 
         
            +
                "    input_columns=[\"input_length\"],\n",
         
     | 
| 552 | 
         
            +
                ")"
         
     | 
| 553 | 
         
            +
               ]
         
     | 
| 554 | 
         
            +
              },
         
     | 
| 555 | 
         
            +
              {
         
     | 
| 556 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 557 | 
         
            +
               "id": "263a5a58-0239-4a25-b0df-c625fc9c5810",
         
     | 
| 558 | 
         
            +
               "metadata": {},
         
     | 
| 559 | 
         
            +
               "source": [
         
     | 
| 560 | 
         
            +
                "## Training and Evaluation"
         
     | 
| 561 | 
         
            +
               ]
         
     | 
| 562 | 
         
            +
              },
         
     | 
| 563 | 
         
            +
              {
         
     | 
| 564 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 565 | 
         
            +
               "id": "a693e768-c5a6-453f-89a1-b601dcf7daf7",
         
     | 
| 566 | 
         
            +
               "metadata": {},
         
     | 
| 567 | 
         
            +
               "source": [
         
     | 
| 568 | 
         
            +
                "Now that we've prepared our data, we're ready to dive into the training pipeline. \n",
         
     | 
| 569 | 
         
            +
                "The [🤗 Trainer](https://huggingface.co/transformers/master/main_classes/trainer.html?highlight=trainer)\n",
         
     | 
| 570 | 
         
            +
                "will do much of the heavy lifting for us. All we have to do is:\n",
         
     | 
| 571 | 
         
            +
                "\n",
         
     | 
| 572 | 
         
            +
                "- Define a data collator: the data collator takes our pre-processed data and prepares PyTorch tensors ready for the model.\n",
         
     | 
| 573 | 
         
            +
                "\n",
         
     | 
| 574 | 
         
            +
                "- Evaluation metrics: during evaluation, we want to evaluate the model using the [word error rate (WER)](https://huggingface.co/metrics/wer) metric. We need to define a `compute_metrics` function that handles this computation.\n",
         
     | 
| 575 | 
         
            +
                "\n",
         
     | 
| 576 | 
         
            +
                "- Load a pre-trained checkpoint: we need to load a pre-trained checkpoint and configure it correctly for training.\n",
         
     | 
| 577 | 
         
            +
                "\n",
         
     | 
| 578 | 
         
            +
                "- Define the training configuration: this will be used by the 🤗 Trainer to define the training schedule."
         
     | 
| 579 | 
         
            +
               ]
         
     | 
| 580 | 
         
            +
              },
         
     | 
| 581 | 
         
            +
              {
         
     | 
| 582 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 583 | 
         
            +
               "id": "8d230e6d-624c-400a-bbf5-fa660881df25",
         
     | 
| 584 | 
         
            +
               "metadata": {},
         
     | 
| 585 | 
         
            +
               "source": [
         
     | 
| 586 | 
         
            +
                "### Define a Data Collator"
         
     | 
| 587 | 
         
            +
               ]
         
     | 
| 588 | 
         
            +
              },
         
     | 
| 589 | 
         
            +
              {
         
     | 
| 590 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 591 | 
         
            +
               "id": "04def221-0637-4a69-b242-d3f0c1d0ee78",
         
     | 
| 592 | 
         
            +
               "metadata": {},
         
     | 
| 593 | 
         
            +
               "source": [
         
     | 
| 594 | 
         
            +
                "The data collator for a sequence-to-sequence speech model is unique in the sense that it \n",
         
     | 
| 595 | 
         
            +
                "treats the `input_features` and `labels` independently: the  `input_features` must be \n",
         
     | 
| 596 | 
         
            +
                "handled by the feature extractor and the `labels` by the tokenizer.\n",
         
     | 
| 597 | 
         
            +
                "\n",
         
     | 
| 598 | 
         
            +
                "The `input_features` are already padded to 30s and converted to a log-Mel spectrogram \n",
         
     | 
| 599 | 
         
            +
                "of fixed dimension by action of the feature extractor, so all we have to do is convert the `input_features`\n",
         
     | 
| 600 | 
         
            +
                "to batched PyTorch tensors. We do this using the feature extractor's `.pad` method with `return_tensors=pt`.\n",
         
     | 
| 601 | 
         
            +
                "\n",
         
     | 
| 602 | 
         
            +
                "The `labels` on the other hand are un-padded. We first pad the sequences\n",
         
     | 
| 603 | 
         
            +
                "to the maximum length in the batch using the tokenizer's `.pad` method. The padding tokens \n",
         
     | 
| 604 | 
         
            +
                "are then replaced by `-100` so that these tokens are **not** taken into account when \n",
         
     | 
| 605 | 
         
            +
                "computing the loss. We then cut the BOS token from the start of the label sequence as we \n",
         
     | 
| 606 | 
         
            +
                "append it later during training.\n",
         
     | 
| 607 | 
         
            +
                "\n",
         
     | 
| 608 | 
         
            +
                "We can leverage the `WhisperProcessor` we defined earlier to perform both the \n",
         
     | 
| 609 | 
         
            +
                "feature extractor and the tokenizer operations:"
         
     | 
| 610 | 
         
            +
               ]
         
     | 
| 611 | 
         
            +
              },
         
     | 
| 612 | 
         
            +
              {
         
     | 
| 613 | 
         
            +
               "cell_type": "code",
         
     | 
| 614 | 
         
            +
               "execution_count": 29,
         
     | 
| 615 | 
         
            +
               "id": "8326221e-ec13-4731-bb4e-51e5fc1486c5",
         
     | 
| 616 | 
         
            +
               "metadata": {},
         
     | 
| 617 | 
         
            +
               "outputs": [],
         
     | 
| 618 | 
         
            +
               "source": [
         
     | 
| 619 | 
         
            +
                "import torch\n",
         
     | 
| 620 | 
         
            +
                "\n",
         
     | 
| 621 | 
         
            +
                "from dataclasses import dataclass\n",
         
     | 
| 622 | 
         
            +
                "from typing import Any, Dict, List, Union\n",
         
     | 
| 623 | 
         
            +
                "\n",
         
     | 
| 624 | 
         
            +
                "@dataclass\n",
         
     | 
| 625 | 
         
            +
                "class DataCollatorSpeechSeq2SeqWithPadding:\n",
         
     | 
| 626 | 
         
            +
                "    processor: Any\n",
         
     | 
| 627 | 
         
            +
                "\n",
         
     | 
| 628 | 
         
            +
                "    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:\n",
         
     | 
| 629 | 
         
            +
                "        # split inputs and labels since they have to be of different lengths and need different padding methods\n",
         
     | 
| 630 | 
         
            +
                "        # first treat the audio inputs by simply returning torch tensors\n",
         
     | 
| 631 | 
         
            +
                "        input_features = [{\"input_features\": feature[\"input_features\"]} for feature in features]\n",
         
     | 
| 632 | 
         
            +
                "        batch = self.processor.feature_extractor.pad(input_features, return_tensors=\"pt\")\n",
         
     | 
| 633 | 
         
            +
                "\n",
         
     | 
| 634 | 
         
            +
                "        # get the tokenized label sequences\n",
         
     | 
| 635 | 
         
            +
                "        label_features = [{\"input_ids\": feature[\"labels\"]} for feature in features]\n",
         
     | 
| 636 | 
         
            +
                "        # pad the labels to max length\n",
         
     | 
| 637 | 
         
            +
                "        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors=\"pt\")\n",
         
     | 
| 638 | 
         
            +
                "\n",
         
     | 
| 639 | 
         
            +
                "        # replace padding with -100 to ignore loss correctly\n",
         
     | 
| 640 | 
         
            +
                "        labels = labels_batch[\"input_ids\"].masked_fill(labels_batch.attention_mask.ne(1), -100)\n",
         
     | 
| 641 | 
         
            +
                "\n",
         
     | 
| 642 | 
         
            +
                "        # if bos token is appended in previous tokenization step,\n",
         
     | 
| 643 | 
         
            +
                "        # cut bos token here as it's append later anyways\n",
         
     | 
| 644 | 
         
            +
                "        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():\n",
         
     | 
| 645 | 
         
            +
                "            labels = labels[:, 1:]\n",
         
     | 
| 646 | 
         
            +
                "\n",
         
     | 
| 647 | 
         
            +
                "        batch[\"labels\"] = labels\n",
         
     | 
| 648 | 
         
            +
                "\n",
         
     | 
| 649 | 
         
            +
                "        return batch"
         
     | 
| 650 | 
         
            +
               ]
         
     | 
| 651 | 
         
            +
              },
         
     | 
| 652 | 
         
            +
              {
         
     | 
| 653 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 654 | 
         
            +
               "id": "3cae7dbf-8a50-456e-a3a8-7fd005390f86",
         
     | 
| 655 | 
         
            +
               "metadata": {},
         
     | 
| 656 | 
         
            +
               "source": [
         
     | 
| 657 | 
         
            +
                "Let's initialise the data collator we've just defined:"
         
     | 
| 658 | 
         
            +
               ]
         
     | 
| 659 | 
         
            +
              },
         
     | 
| 660 | 
         
            +
              {
         
     | 
| 661 | 
         
            +
               "cell_type": "code",
         
     | 
| 662 | 
         
            +
               "execution_count": 30,
         
     | 
| 663 | 
         
            +
               "id": "fc834702-c0d3-4a96-b101-7b87be32bf42",
         
     | 
| 664 | 
         
            +
               "metadata": {},
         
     | 
| 665 | 
         
            +
               "outputs": [],
         
     | 
| 666 | 
         
            +
               "source": [
         
     | 
| 667 | 
         
            +
                "data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)"
         
     | 
| 668 | 
         
            +
               ]
         
     | 
| 669 | 
         
            +
              },
         
     | 
| 670 | 
         
            +
              {
         
     | 
| 671 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 672 | 
         
            +
               "id": "d62bb2ab-750a-45e7-82e9-61d6f4805698",
         
     | 
| 673 | 
         
            +
               "metadata": {},
         
     | 
| 674 | 
         
            +
               "source": [
         
     | 
| 675 | 
         
            +
                "### Evaluation Metrics"
         
     | 
| 676 | 
         
            +
               ]
         
     | 
| 677 | 
         
            +
              },
         
     | 
| 678 | 
         
            +
              {
         
     | 
| 679 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 680 | 
         
            +
               "id": "66fee1a7-a44c-461e-b047-c3917221572e",
         
     | 
| 681 | 
         
            +
               "metadata": {},
         
     | 
| 682 | 
         
            +
               "source": [
         
     | 
| 683 | 
         
            +
                "We'll use the word error rate (WER) metric, the 'de-facto' metric for assessing \n",
         
     | 
| 684 | 
         
            +
                "ASR systems. For more information, refer to the WER [docs](https://huggingface.co/metrics/wer). We'll load the WER metric from 🤗 Evaluate:"
         
     | 
| 685 | 
         
            +
               ]
         
     | 
| 686 | 
         
            +
              },
         
     | 
| 687 | 
         
            +
              {
         
     | 
| 688 | 
         
            +
               "cell_type": "code",
         
     | 
| 689 | 
         
            +
               "execution_count": 31,
         
     | 
| 690 | 
         
            +
               "id": "b22b4011-f31f-4b57-b684-c52332f92890",
         
     | 
| 691 | 
         
            +
               "metadata": {},
         
     | 
| 692 | 
         
            +
               "outputs": [],
         
     | 
| 693 | 
         
            +
               "source": [
         
     | 
| 694 | 
         
            +
                "import evaluate\n",
         
     | 
| 695 | 
         
            +
                "\n",
         
     | 
| 696 | 
         
            +
                "metric = evaluate.load(\"wer\")"
         
     | 
| 697 | 
         
            +
               ]
         
     | 
| 698 | 
         
            +
              },
         
     | 
| 699 | 
         
            +
              {
         
     | 
| 700 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 701 | 
         
            +
               "id": "509f96d7-3f11-4f37-add9-f74a0c44f3fc",
         
     | 
| 702 | 
         
            +
               "metadata": {},
         
     | 
| 703 | 
         
            +
               "source": [
         
     | 
| 704 | 
         
            +
                "We then simply have to define a function that takes our model \n",
         
     | 
| 705 | 
         
            +
                "predictions and returns the WER metric. This function, called\n",
         
     | 
| 706 | 
         
            +
                "`compute_metrics`, first replaces `-100` with the `pad_token_id`\n",
         
     | 
| 707 | 
         
            +
                "in the `label_ids` (undoing the step we applied in the \n",
         
     | 
| 708 | 
         
            +
                "data collator to ignore padded tokens correctly in the loss).\n",
         
     | 
| 709 | 
         
            +
                "It then decodes the predicted and label ids to strings. Finally,\n",
         
     | 
| 710 | 
         
            +
                "it computes the WER between the predictions and reference labels. \n",
         
     | 
| 711 | 
         
            +
                "Here, we have the option of evaluating with the 'normalised' transcriptions \n",
         
     | 
| 712 | 
         
            +
                "and predictions. We recommend you set this to `True` to benefit from the WER \n",
         
     | 
| 713 | 
         
            +
                "improvement obtained by normalising the transcriptions."
         
     | 
| 714 | 
         
            +
               ]
         
     | 
| 715 | 
         
            +
              },
         
     | 
| 716 | 
         
            +
              {
         
     | 
| 717 | 
         
            +
               "cell_type": "code",
         
     | 
| 718 | 
         
            +
               "execution_count": 32,
         
     | 
| 719 | 
         
            +
               "id": "a11d1bfc-9e28-460f-a287-72d8f7bc1acb",
         
     | 
| 720 | 
         
            +
               "metadata": {},
         
     | 
| 721 | 
         
            +
               "outputs": [],
         
     | 
| 722 | 
         
            +
               "source": [
         
     | 
| 723 | 
         
            +
                "# evaluate with the 'normalised' WER\n",
         
     | 
| 724 | 
         
            +
                "do_normalize_eval = True\n",
         
     | 
| 725 | 
         
            +
                "\n",
         
     | 
| 726 | 
         
            +
                "def compute_metrics(pred):\n",
         
     | 
| 727 | 
         
            +
                "    pred_ids = pred.predictions\n",
         
     | 
| 728 | 
         
            +
                "    label_ids = pred.label_ids\n",
         
     | 
| 729 | 
         
            +
                "\n",
         
     | 
| 730 | 
         
            +
                "    # replace -100 with the pad_token_id\n",
         
     | 
| 731 | 
         
            +
                "    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id\n",
         
     | 
| 732 | 
         
            +
                "\n",
         
     | 
| 733 | 
         
            +
                "    # we do not want to group tokens when computing the metrics\n",
         
     | 
| 734 | 
         
            +
                "    pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)\n",
         
     | 
| 735 | 
         
            +
                "    label_str = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True)\n",
         
     | 
| 736 | 
         
            +
                "\n",
         
     | 
| 737 | 
         
            +
                "    if do_normalize_eval:\n",
         
     | 
| 738 | 
         
            +
                "        pred_str = [normalizer(pred) for pred in pred_str]\n",
         
     | 
| 739 | 
         
            +
                "        label_str = [normalizer(label) for label in label_str]\n",
         
     | 
| 740 | 
         
            +
                "        # filtering step to only evaluate the samples that correspond to non-zero references:\n",
         
     | 
| 741 | 
         
            +
                "        pred_str = [pred_str[i] for i in range(len(pred_str)) if len(label_str[i]) > 0]\n",
         
     | 
| 742 | 
         
            +
                "        label_str = [label_str[i] for i in range(len(label_str)) if len(label_str[i]) > 0]\n",
         
     | 
| 743 | 
         
            +
                "\n",
         
     | 
| 744 | 
         
            +
                "    wer = 100 * metric.compute(predictions=pred_str, references=label_str)\n",
         
     | 
| 745 | 
         
            +
                "\n",
         
     | 
| 746 | 
         
            +
                "    return {\"wer\": wer}"
         
     | 
| 747 | 
         
            +
               ]
         
     | 
| 748 | 
         
            +
              },
         
     | 
| 749 | 
         
            +
              {
         
     | 
| 750 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 751 | 
         
            +
               "id": "daf2a825-6d9f-4a23-b145-c37c0039075b",
         
     | 
| 752 | 
         
            +
               "metadata": {},
         
     | 
| 753 | 
         
            +
               "source": [
         
     | 
| 754 | 
         
            +
                "### Load a Pre-Trained Checkpoint"
         
     | 
| 755 | 
         
            +
               ]
         
     | 
| 756 | 
         
            +
              },
         
     | 
| 757 | 
         
            +
              {
         
     | 
| 758 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 759 | 
         
            +
               "id": "437a97fa-4864-476b-8abc-f28b8166cfa5",
         
     | 
| 760 | 
         
            +
               "metadata": {},
         
     | 
| 761 | 
         
            +
               "source": [
         
     | 
| 762 | 
         
            +
                "Now let's load the pre-trained Whisper `small` checkpoint. Again, this \n",
         
     | 
| 763 | 
         
            +
                "is trivial through use of 🤗 Transformers!"
         
     | 
| 764 | 
         
            +
               ]
         
     | 
| 765 | 
         
            +
              },
         
     | 
| 766 | 
         
            +
              {
         
     | 
| 767 | 
         
            +
               "cell_type": "code",
         
     | 
| 768 | 
         
            +
               "execution_count": 33,
         
     | 
| 769 | 
         
            +
               "id": "5a10cc4b-07ec-4ebd-ac1d-7c601023594f",
         
     | 
| 770 | 
         
            +
               "metadata": {},
         
     | 
| 771 | 
         
            +
               "outputs": [],
         
     | 
| 772 | 
         
            +
               "source": [
         
     | 
| 773 | 
         
            +
                "from transformers import WhisperForConditionalGeneration\n",
         
     | 
| 774 | 
         
            +
                "\n",
         
     | 
| 775 | 
         
            +
                "model = WhisperForConditionalGeneration.from_pretrained(\"juancopi81/whisper-medium-es\")"
         
     | 
| 776 | 
         
            +
               ]
         
     | 
| 777 | 
         
            +
              },
         
     | 
| 778 | 
         
            +
              {
         
     | 
| 779 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 780 | 
         
            +
               "id": "a15ead5f-2277-4a39-937b-585c2497b2df",
         
     | 
| 781 | 
         
            +
               "metadata": {},
         
     | 
| 782 | 
         
            +
               "source": [
         
     | 
| 783 | 
         
            +
                "Override generation arguments - no tokens are forced as decoder outputs (see [`forced_decoder_ids`](https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.generation_utils.GenerationMixin.generate.forced_decoder_ids)), no tokens are suppressed during generation (see [`suppress_tokens`](https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.generation_utils.GenerationMixin.generate.suppress_tokens)). Set `use_cache` to False since we're using gradient checkpointing, and the two are incompatible:"
         
     | 
| 784 | 
         
            +
               ]
         
     | 
| 785 | 
         
            +
              },
         
     | 
| 786 | 
         
            +
              {
         
     | 
| 787 | 
         
            +
               "cell_type": "code",
         
     | 
| 788 | 
         
            +
               "execution_count": 34,
         
     | 
| 789 | 
         
            +
               "id": "62038ba3-88ed-4fce-84db-338f50dcd04f",
         
     | 
| 790 | 
         
            +
               "metadata": {},
         
     | 
| 791 | 
         
            +
               "outputs": [],
         
     | 
| 792 | 
         
            +
               "source": [
         
     | 
| 793 | 
         
            +
                "model.config.forced_decoder_ids = None\n",
         
     | 
| 794 | 
         
            +
                "model.config.suppress_tokens = []\n",
         
     | 
| 795 | 
         
            +
                "model.config.use_cache = False\n",
         
     | 
| 796 | 
         
            +
                "model.config.dropout = 0.1"
         
     | 
| 797 | 
         
            +
               ]
         
     | 
| 798 | 
         
            +
              },
         
     | 
| 799 | 
         
            +
              {
         
     | 
| 800 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 801 | 
         
            +
               "id": "2178dea4-80ca-47b6-b6ea-ba1915c90c06",
         
     | 
| 802 | 
         
            +
               "metadata": {},
         
     | 
| 803 | 
         
            +
               "source": [
         
     | 
| 804 | 
         
            +
                "### Define the Training Configuration"
         
     | 
| 805 | 
         
            +
               ]
         
     | 
| 806 | 
         
            +
              },
         
     | 
| 807 | 
         
            +
              {
         
     | 
| 808 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 809 | 
         
            +
               "id": "c21af1e9-0188-4134-ac82-defc7bdcc436",
         
     | 
| 810 | 
         
            +
               "metadata": {},
         
     | 
| 811 | 
         
            +
               "source": [
         
     | 
| 812 | 
         
            +
                "In the final step, we define all the parameters related to training. Here, you can set the `max_steps` to train for longer. For more detail on the training arguments, refer to the Seq2SeqTrainingArguments [docs](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.Seq2SeqTrainingArguments)."
         
     | 
| 813 | 
         
            +
               ]
         
     | 
| 814 | 
         
            +
              },
         
     | 
| 815 | +   { |
| 816 | +    "cell_type": "code", |
| 817 | +    "execution_count": 35, |
| 818 | +    "id": "0ae3e9af-97b7-4aa0-ae85-20b23b5bcb3a", |
| 819 | +    "metadata": {}, |
| 820 | +    "outputs": [], |
| 821 | +    "source": [ |
| 822 | +     "from transformers import Seq2SeqTrainingArguments\n", |
| 823 | +     "\n", |
| 824 | +     "training_args = Seq2SeqTrainingArguments(\n", |
| 825 | +     "    output_dir=\"./\",\n", |
| 826 | +     "    per_device_train_batch_size=32,\n", |
| 827 | +     "    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size\n", |
| 828 | +     "    learning_rate=1e-5,\n", |
| 829 | +     "    warmup_steps=500,\n", |
| 830 | +     "    max_steps=5000,\n", |
| 831 | +     "    gradient_checkpointing=True,\n", |
| 832 | +     "    fp16=True,\n", |
| 833 | +     "    evaluation_strategy=\"steps\",\n", |
| 834 | +     "    per_device_eval_batch_size=16,\n", |
| 835 | +     "    predict_with_generate=True,\n", |
| 836 | +     "    generation_max_length=225,\n", |
| 837 | +     "    save_steps=1000,\n", |
| 838 | +     "    eval_steps=1000,\n", |
| 839 | +     "    logging_steps=25,\n", |
| 840 | +     "    report_to=[\"tensorboard\"],\n", |
| 841 | +     "    load_best_model_at_end=True,\n", |
| 842 | +     "    metric_for_best_model=\"wer\",\n", |
| 843 | +     "    greater_is_better=False,\n", |
| 844 | +     "    push_to_hub=True,\n", |
| 845 | +     ")" |
| 846 | +    ] |
| 847 | +   }, |
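A quick sanity check on how these arguments interact: with `max_steps=5000` and a total train batch size of 32 (one device, no gradient accumulation), training consumes 5000 × 32 = 160,000 examples, which matches the `Num examples = 160000` line in the training log further down, regardless of how many passes over the streamed subsets that requires.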
| 848 | +   { |
| 849 | +    "cell_type": "markdown", |
| 850 | +    "id": "b3a944d8-3112-4552-82a0-be25988b3857", |
| 851 | +    "metadata": {}, |
| 852 | +    "source": [ |
| 853 | +     "**Note**: if you do not want to upload the model checkpoints to the Hub, \n", |
| 854 | +     "set `push_to_hub=False`." |
| 855 | +    ] |
| 856 | +   }, |
| 857 | +   { |
| 858 | +    "cell_type": "markdown", |
| 859 | +    "id": "393c883e-3e50-492c-bd58-f51dbf15ee56", |
| 860 | +    "metadata": {}, |
| 861 | +    "source": [ |
| 862 | +     "We then define a custom [Callback](https://huggingface.co/docs/transformers/main_classes/callback) that is called by the 🤗 Trainer at the end of each epoch. The Callback reinitialises and reshuffles the streaming dataset at the beginning of each new epoch, giving a different shuffling of our subsets for every epoch." |
| 863 | +    ] |
| 864 | +   }, |
| 865 | +   { |
| 866 | +    "cell_type": "code", |
| 867 | +    "execution_count": 36, |
| 868 | +    "id": "3ac16b62-b3c0-4c68-8f3d-9ecf471534b2", |
| 869 | +    "metadata": {}, |
| 870 | +    "outputs": [], |
| 871 | +    "source": [ |
| 872 | +     "from transformers import TrainerCallback\n", |
| 873 | +     "from transformers.trainer_pt_utils import IterableDatasetShard\n", |
| 874 | +     "from torch.utils.data import IterableDataset\n", |
| 875 | +     "\n", |
| 876 | +     "# trainer callback to reinitialise and reshuffle the streamable datasets at the beginning of each epoch\n", |
| 877 | +     "class ShuffleCallback(TrainerCallback):\n", |
| 878 | +     "    def on_epoch_begin(self, args, state, control, train_dataloader, **kwargs):\n", |
| 879 | +     "        if isinstance(train_dataloader.dataset, IterableDatasetShard):\n", |
| 880 | +     "            pass  # set_epoch() is handled by the Trainer\n", |
| 881 | +     "        elif isinstance(train_dataloader.dataset, IterableDataset):\n", |
| 882 | +     "            train_dataloader.dataset.set_epoch(train_dataloader.dataset._epoch + 1)" |
| 883 | +    ] |
| 884 | +   }, |
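To see what `set_epoch()` does outside the Trainer, here is a minimal sketch. It assumes a `datasets` version recent enough to provide `Dataset.to_iterable_dataset()`; the toy data and variable names are illustrative, not from the notebook:

```python
from datasets import Dataset

# A toy iterable dataset: for a shuffled IterableDataset, the effective
# shuffling seed is (seed + epoch), so bumping the epoch reorders the stream.
ds = Dataset.from_dict({"x": list(range(10))}).to_iterable_dataset()
shuffled = ds.shuffle(seed=0, buffer_size=10)

shuffled.set_epoch(0)
print([ex["x"] for ex in shuffled])  # one ordering

shuffled.set_epoch(1)
print([ex["x"] for ex in shuffled])  # a different ordering of the same examples
```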
| 885 | +   { |
| 886 | +    "cell_type": "markdown", |
| 887 | +    "id": "bac29114-d226-4f54-97cf-8718c9f94e1e", |
| 888 | +    "metadata": {}, |
| 889 | +    "source": [ |
| 890 | +     "We can forward the training arguments to the 🤗 Trainer along with our model,\n", |
| 891 | +     "dataset, data collator, `compute_metrics` function and custom callback:" |
| 892 | +    ] |
| 893 | +   }, |
| 894 | +   { |
| 895 | +    "cell_type": "code", |
| 896 | +    "execution_count": 37, |
| 897 | +    "id": "d546d7fe-0543-479a-b708-2ebabec19493", |
| 898 | +    "metadata": {}, |
| 899 | +    "outputs": [ |
| 900 | +     { |
| 901 | +      "name": "stderr", |
| 902 | +      "output_type": "stream", |
| 903 | +      "text": [ |
| 904 | +       "/home/ubuntu/whisper-small-es-common-fleurs/./ is already a clone of https://huggingface.co/juancopi81/whisper-small-es-common-fleurs. Make sure you pull the latest changes with `repo.git_pull()`.\n", |
| 905 | +       "max_steps is given, it will override any value given in num_train_epochs\n", |
| 906 | +       "Using cuda_amp half precision backend\n" |
| 907 | +      ] |
| 908 | +     } |
| 909 | +    ], |
| 910 | +    "source": [ |
| 911 | +     "from transformers import Seq2SeqTrainer\n", |
| 912 | +     "\n", |
| 913 | +     "trainer = Seq2SeqTrainer(\n", |
| 914 | +     "    args=training_args,\n", |
| 915 | +     "    model=model,\n", |
| 916 | +     "    train_dataset=vectorized_datasets[\"train\"],\n", |
| 917 | +     "    eval_dataset=vectorized_datasets[\"test\"],\n", |
| 918 | +     "    data_collator=data_collator,\n", |
| 919 | +     "    compute_metrics=compute_metrics,\n", |
| 920 | +     "    tokenizer=processor,\n", |
| 921 | +     "    callbacks=[ShuffleCallback()],\n", |
| 922 | +     ")" |
| 923 | +    ] |
| 924 | +   }, |
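Note that the full `processor` is passed as the `tokenizer` argument. The Trainer then saves the feature-extractor and tokenizer files alongside every model checkpoint, which is exactly what the `Feature extractor saved in ...` and `tokenizer config file saved in ...` lines in the training output below report.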
| 925 | +   { |
| 926 | +    "cell_type": "markdown", |
| 927 | +    "id": "67ab88c3-7091-4e51-8ad5-f5cacbe18449", |
| 928 | +    "metadata": {}, |
| 929 | +    "source": [ |
| 930 | +     "We'll save the model and processor to the output directory before training:" |
| 931 | +    ] |
| 932 | +   }, |
| 933 | +   { |
| 934 | +    "cell_type": "code", |
| 935 | +    "execution_count": 38, |
| 936 | +    "id": "a1ccb9ed-cbc8-4419-91c0-651e9424b672", |
| 937 | +    "metadata": {}, |
| 938 | +    "outputs": [ |
| 939 | +     { |
| 940 | +      "name": "stderr", |
| 941 | +      "output_type": "stream", |
| 942 | +      "text": [ |
| 943 | +       "Configuration saved in ./config.json\n", |
| 944 | +       "Model weights saved in ./pytorch_model.bin\n", |
| 945 | +       "Feature extractor saved in ./preprocessor_config.json\n", |
| 946 | +       "tokenizer config file saved in ./tokenizer_config.json\n", |
| 947 | +       "Special tokens file saved in ./special_tokens_map.json\n", |
| 948 | +       "added tokens file saved in ./added_tokens.json\n" |
| 949 | +      ] |
| 950 | +     } |
| 951 | +    ], |
| 952 | +    "source": [ |
| 953 | +     "model.save_pretrained(training_args.output_dir)\n", |
| 954 | +     "processor.save_pretrained(training_args.output_dir)" |
| 955 | +    ] |
| 956 | +   }, |
| 957 | +   { |
| 958 | +    "cell_type": "markdown", |
| 959 | +    "id": "7f404cf9-4345-468c-8196-4bd101d9bd51", |
| 960 | +    "metadata": {}, |
| 961 | +    "source": [ |
| 962 | +     "### Training" |
| 963 | +    ] |
| 964 | +   }, |
| 965 | +   { |
| 966 | +    "cell_type": "markdown", |
| 967 | +    "id": "5e8b8d56-5a70-4f68-bd2e-f0752d0bd112", |
| 968 | +    "metadata": {}, |
| 969 | +    "source": [ |
| 970 | +     "Training will take approximately 5-10 hours depending on your GPU. The peak GPU memory for the given training configuration is approximately 36GB. \n", |
| 971 | +     "If you encounter a CUDA `\"out-of-memory\"` error when you launch training, \n", |
| 972 | +     "you can reduce the `per_device_train_batch_size` incrementally by factors of 2 \n", |
| 973 | +     "and employ [`gradient_accumulation_steps`](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.Seq2SeqTrainingArguments.gradient_accumulation_steps)\n", |
| 974 | +     "to compensate.\n", |
| 975 | +     "\n", |
| 976 | +     "To launch training, simply execute:" |
| 977 | +    ] |
| 978 | +   }, |
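As a concrete example of that advice, halving the batch size and doubling the accumulation keeps the effective train batch size at 16 × 2 = 32. A minimal sketch with illustrative values; only the first two arguments change relative to the configuration cell above:

```python
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./",
    per_device_train_batch_size=16,  # halved from 32
    gradient_accumulation_steps=2,   # doubled to compensate: 16 * 2 = 32 effective
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=5000,
    gradient_checkpointing=True,
    fp16=True,
    # ... remaining arguments unchanged from the configuration cell above ...
)
```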
| 979 | +   { |
| 980 | +    "cell_type": "code", |
| 981 | +    "execution_count": 39, |
| 982 | +    "id": "ced90915-84df-4538-9034-f6c8c85de2df", |
| 983 | +    "metadata": {}, |
| 984 | +    "outputs": [ |
| 985 | +     { |
| 986 | +      "data": { |
| 987 | +       "application/vnd.jupyter.widget-view+json": { |
| 988 | +        "model_id": "2e4f6ccd07d344d08259008b7485b7db", |
| 989 | +        "version_major": 2, |
| 990 | +        "version_minor": 0 |
| 991 | +       }, |
| 992 | +       "text/plain": [ |
| 993 | +        "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…" |
| 994 | +       ] |
| 995 | +      }, |
| 996 | +      "metadata": {}, |
| 997 | +      "output_type": "display_data" |
| 998 | +     } |
| 999 | +    ], |
| 1000 | +    "source": [ |
| 1001 | +     "from huggingface_hub import notebook_login\n", |
| 1002 | +     "\n", |
| 1003 | +     "notebook_login()" |
| 1004 | +    ] |
| 1005 | +   }, |
| 1006 | +   { |
| 1007 | +    "cell_type": "code", |
| 1008 | +    "execution_count": null, |
| 1009 | +    "id": "ee8b7b8e-1c9a-4d77-9137-1778a629e6de", |
| 1010 | +    "metadata": {}, |
| 1011 | +    "outputs": [ |
| 1012 | +     { |
| 1013 | +      "name": "stderr", |
| 1014 | +      "output_type": "stream", |
| 1015 | +      "text": [ |
| 1016 | +       "/home/ubuntu/hf_env/lib/python3.8/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", |
| 1017 | +       "  warnings.warn(\n", |
| 1018 | +       "***** Running training *****\n", |
| 1019 | +       "  Num examples = 160000\n", |
| 1020 | +       "  Num Epochs = 9223372036854775807\n", |
| 1021 | +       "  Instantaneous batch size per device = 32\n", |
| 1022 | +       "  Total train batch size (w. parallel, distributed & accumulation) = 32\n", |
| 1023 | +       "  Gradient Accumulation steps = 1\n", |
| 1024 | +       "  Total optimization steps = 5000\n", |
| 1025 | +       "  Number of trainable parameters = 763857920\n", |
| 1026 | +       "Reading metadata...: 230467it [00:04, 49083.73it/s]\n", |
| 1027 | +       "The following columns in the training set don't have a corresponding argument in `WhisperForConditionalGeneration.forward` and have been ignored: input_length. If input_length are not expected by `WhisperForConditionalGeneration.forward`,  you can safely ignore this message.\n" |
| 1028 | +      ] |
| 1029 | +     }, |
| 1030 | +     { |
| 1031 | +      "data": { |
| 1032 | +       "text/html": [ |
| 1033 | +        "\n", |
| 1034 | +        "    <div>\n", |
| 1035 | +        "      \n", |
| 1036 | +        "      <progress value='3001' max='5000' style='width:300px; height:20px; vertical-align: middle;'></progress>\n", |
| 1037 | +        "      [3001/5000 10:24:07 < 6:56:01, 0.08 it/s, Epoch 0.60/9223372036854775807]\n", |
| 1038 | +        "    </div>\n", |
| 1039 | +        "    <table border=\"1\" class=\"dataframe\">\n", |
| 1040 | +        "  <thead>\n", |
| 1041 | +        " <tr style=\"text-align: left;\">\n", |
| 1042 | +        "      <th>Step</th>\n", |
| 1043 | +        "      <th>Training Loss</th>\n", |
| 1044 | +        "      <th>Validation Loss</th>\n", |
| 1045 | +        "      <th>Wer</th>\n", |
| 1046 | +        "    </tr>\n", |
| 1047 | +        "  </thead>\n", |
| 1048 | +        "  <tbody>\n", |
| 1049 | +        "    <tr>\n", |
| 1050 | +        "      <td>1000</td>\n", |
| 1051 | +        "      <td>0.069400</td>\n", |
| 1052 | +        "      <td>0.219434</td>\n", |
| 1053 | +        "      <td>6.819422</td>\n", |
| 1054 | +        "    </tr>\n", |
| 1055 | +        "    <tr>\n", |
| 1056 | +        "      <td>2000</td>\n", |
| 1057 | +        "      <td>0.033600</td>\n", |
| 1058 | +        "      <td>0.209724</td>\n", |
| 1059 | +        "      <td>6.755756</td>\n", |
| 1060 | +        "    </tr>\n", |
| 1061 | +        "  </tbody>\n", |
| 1062 | +        "</table><p>" |
| 1063 | +       ], |
| 1064 | +       "text/plain": [ |
| 1065 | +        "<IPython.core.display.HTML object>" |
| 1066 | +       ] |
| 1067 | +      }, |
| 1068 | +      "metadata": {}, |
| 1069 | +      "output_type": "display_data" |
| 1070 | +     }, |
| 1071 | +     { |
| 1072 | +      "name": "stderr", |
| 1073 | +      "output_type": "stream", |
| 1074 | +      "text": [ |
| 1075 | +       "***** Running Evaluation *****\n", |
| 1076 | +       "  Num examples: Unknown\n", |
| 1077 | +       "  Batch size = 16\n", |
| 1078 | +       "Reading metadata...: 15520it [00:00, 42402.78it/s]\n", |
| 1079 | +       "The following columns in the evaluation set don't have a corresponding argument in `WhisperForConditionalGeneration.forward` and have been ignored: input_length, segment, down_votes, age, up_votes, accent, locale, path, client_id, gender. If input_length, segment, down_votes, age, up_votes, accent, locale, path, client_id, gender are not expected by `WhisperForConditionalGeneration.forward`,  you can safely ignore this message.\n", |
| 1080 | +       "Saving model checkpoint to ./checkpoint-1000\n", |
| 1081 | +       "Configuration saved in ./checkpoint-1000/config.json\n", |
| 1082 | +       "Model weights saved in ./checkpoint-1000/pytorch_model.bin\n", |
| 1083 | +       "Feature extractor saved in ./checkpoint-1000/preprocessor_config.json\n", |
| 1084 | +       "tokenizer config file saved in ./checkpoint-1000/tokenizer_config.json\n", |
| 1085 | +       "Special tokens file saved in ./checkpoint-1000/special_tokens_map.json\n", |
| 1086 | +       "added tokens file saved in ./checkpoint-1000/added_tokens.json\n", |
| 1087 | +       "Feature extractor saved in ./preprocessor_config.json\n", |
| 1088 | +       "tokenizer config file saved in ./tokenizer_config.json\n", |
| 1089 | +       "Special tokens file saved in ./special_tokens_map.json\n", |
| 1090 | +       "added tokens file saved in ./added_tokens.json\n", |
| 1091 | +       "***** Running Evaluation *****\n", |
| 1092 | +       "  Num examples: Unknown\n", |
| 1093 | +       "  Batch size = 16\n", |
| 1094 | +       "Reading metadata...: 15520it [00:00, 27981.68it/s]\n", |
| 1095 | +       "The following columns in the evaluation set don't have a corresponding argument in `WhisperForConditionalGeneration.forward` and have been ignored: input_length, segment, down_votes, age, up_votes, accent, locale, path, client_id, gender. If input_length, segment, down_votes, age, up_votes, accent, locale, path, client_id, gender are not expected by `WhisperForConditionalGeneration.forward`,  you can safely ignore this message.\n", |
| 1096 | +       "Saving model checkpoint to ./checkpoint-2000\n", |
| 1097 | +       "Configuration saved in ./checkpoint-2000/config.json\n", |
| 1098 | +       "Model weights saved in ./checkpoint-2000/pytorch_model.bin\n", |
| 1099 | +       "Feature extractor saved in ./checkpoint-2000/preprocessor_config.json\n", |
| 1100 | +       "tokenizer config file saved in ./checkpoint-2000/tokenizer_config.json\n", |
| 1101 | +       "Special tokens file saved in ./checkpoint-2000/special_tokens_map.json\n", |
| 1102 | +       "added tokens file saved in ./checkpoint-2000/added_tokens.json\n", |
| 1103 | +       "Feature extractor saved in ./preprocessor_config.json\n", |
| 1104 | +       "tokenizer config file saved in ./tokenizer_config.json\n", |
| 1105 | +       "Special tokens file saved in ./special_tokens_map.json\n", |
| 1106 | +       "added tokens file saved in ./added_tokens.json\n", |
| 1107 | +       "***** Running Evaluation *****\n", |
| 1108 | +       "  Num examples: Unknown\n", |
| 1109 | +       "  Batch size = 16\n", |
| 1110 | +       "Reading metadata...: 15520it [00:00, 72511.74it/s]\n", |
| 1111 | +       "The following columns in the evaluation set don't have a corresponding argument in `WhisperForConditionalGeneration.forward` and have been ignored: input_length, segment, down_votes, age, up_votes, accent, locale, path, client_id, gender. If input_length, segment, down_votes, age, up_votes, accent, locale, path, client_id, gender are not expected by `WhisperForConditionalGeneration.forward`,  you can safely ignore this message.\n" |
| 1112 | +      ] |
| 1113 | +     } |
| 1114 | +    ], |
| 1115 | +    "source": [ |
| 1116 | +     "trainer.train()" |
| 1117 | +    ] |
| 1118 | +   }, |
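Reading the table above together with the configuration: `metric_for_best_model="wer"` with `greater_is_better=False` means the checkpoint with the lowest word error rate wins, so at this point checkpoint-2000 (WER 6.76) would be restored by `load_best_model_at_end` in preference to checkpoint-1000 (WER 6.82).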
| 1119 | +   { |
| 1120 | +    "cell_type": "markdown", |
| 1121 | +    "id": "747c6a6e", |
| 1122 | +    "metadata": { |
| 1123 | +     "pycharm": { |
| 1124 | +      "name": "#%% md\n" |
| 1125 | +     } |
| 1126 | +    }, |
| 1127 | +    "source": [ |
| 1128 | +     "(note that training may take some time to commence as we load the first training data samples in streaming mode)" |
| 1129 | +    ] |
| 1130 | +   }, |
         
     | 
| 1132 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 1133 | 
         
            +
               "id": "810ced54-7187-4a06-b2fe-ba6dcca94dc3",
         
     | 
| 1134 | 
         
            +
               "metadata": {},
         
     | 
| 1135 | 
         
            +
               "source": [
         
     | 
| 1136 | 
         
            +
                "We can label our checkpoint with the `whisper-event` tag on push by setting the appropriate key-word arguments (kwargs):"
         
     | 
| 1137 | 
         
            +
               ]
         
     | 
| 1138 | 
         
            +
              },
         
     | 
| 1139 | 
         
            +
              {
         
     | 
| 1140 | 
         
            +
               "cell_type": "code",
         
     | 
| 1141 | 
         
            +
               "execution_count": null,
         
     | 
| 1142 | 
         
            +
               "id": "6dd0e310-9b07-4133-ac14-2ed2d7524e22",
         
     | 
| 1143 | 
         
            +
               "metadata": {},
         
     | 
| 1144 | 
         
            +
               "outputs": [],
         
     | 
| 1145 | 
         
            +
               "source": [
         
     | 
| 1146 | 
         
            +
                "kwargs = {\n",
         
     | 
| 1147 | 
         
            +
                "    \"dataset_tags\": \"mozilla-foundation/common_voice_11_0\",\n",
         
     | 
| 1148 | 
         
            +
                "    \"dataset\": \"Common Voice 11.0\",  # a 'pretty' name for the training dataset\n",
         
     | 
| 1149 | 
         
            +
                "    \"language\": \"es\",\n",
         
     | 
| 1150 | 
         
            +
                "    \"model_name\": \"Whisper Small Es - Sanchit Gandhi\",  # a 'pretty' name for your model\n",
         
     | 
| 1151 | 
         
            +
                "    \"finetuned_from\": \"openai/whisper-small\",\n",
         
     | 
| 1152 | 
         
            +
                "    \"tasks\": \"automatic-speech-recognition\",\n",
         
     | 
| 1153 | 
         
            +
                "    \"tags\": \"whisper-event\",\n",
         
     | 
| 1154 | 
         
            +
                "}"
         
     | 
| 1155 | 
         
            +
               ]
         
     | 
| 1156 | 
         
            +
              },
         
     | 
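These kwargs feed the model card that is auto-generated on push. If you want to inspect the card before uploading anything, `Trainer.create_model_card` accepts the same keys; a minimal sketch, assuming the `trainer` and `kwargs` defined above:

```python
# Writes README.md into training_args.output_dir without touching the Hub.
trainer.create_model_card(**kwargs)
```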
| 1157 | +   { |
| 1158 | +    "cell_type": "markdown", |
| 1159 | +    "id": "090d676a-f944-4297-a938-a40eda0b2b68", |
| 1160 | +    "metadata": {}, |
| 1161 | +    "source": [ |
| 1162 | +     "The training results can now be uploaded to the Hub. To do so, execute the `push_to_hub` command:" |
| 1163 | +    ] |
| 1164 | +   }, |
| 1165 | +   { |
| 1166 | +    "cell_type": "code", |
| 1167 | +    "execution_count": null, |
| 1168 | +    "id": "95737cda-c5dd-4887-a4d0-dfcb0d61d977", |
| 1169 | +    "metadata": {}, |
| 1170 | +    "outputs": [], |
| 1171 | +    "source": [ |
| 1172 | +     "trainer.push_to_hub(**kwargs)" |
| 1173 | +    ] |
| 1174 | +   }, |
| 1175 | +   { |
| 1176 | +    "cell_type": "code", |
| 1177 | +    "execution_count": null, |
| 1178 | +    "id": "29e716f8-7386-4c8f-a35a-4f682ec24eb0", |
| 1179 | +    "metadata": {}, |
| 1180 | +    "outputs": [], |
| 1181 | +    "source": [] |
| 1182 | +   } |
| 1183 | +  ], |
| 1184 | +  "metadata": { |
| 1185 | +   "kernelspec": { |
| 1186 | +    "display_name": "hf_env", |
| 1187 | +    "language": "python", |
| 1188 | +    "name": "hf_env" |
| 1189 | +   }, |
| 1190 | +   "language_info": { |
| 1191 | +    "codemirror_mode": { |
| 1192 | +     "name": "ipython", |
| 1193 | +     "version": 3 |
| 1194 | +    }, |
| 1195 | +    "file_extension": ".py", |
| 1196 | +    "mimetype": "text/x-python", |
| 1197 | +    "name": "python", |
| 1198 | +    "nbconvert_exporter": "python", |
| 1199 | +    "pygments_lexer": "ipython3", |
| 1200 | +    "version": "3.8.10" |
| 1201 | +   } |
| 1202 | +  }, |
| 1203 | + "nbformat": 4, |
| 1204 | + "nbformat_minor": 5 |
| 1205 | +} |
    	
added_tokens.json
ADDED
@@ -0,0 +1,109 @@
| 1 | + { |
| 2 | +   "<|af|>": 50327, |
| 3 | +   "<|am|>": 50334, |
| 4 | +   "<|ar|>": 50272, |
| 5 | +   "<|as|>": 50350, |
| 6 | +   "<|az|>": 50304, |
| 7 | +   "<|ba|>": 50355, |
| 8 | +   "<|be|>": 50330, |
| 9 | +   "<|bg|>": 50292, |
| 10 | +   "<|bn|>": 50302, |
| 11 | +   "<|bo|>": 50347, |
| 12 | +   "<|br|>": 50309, |
| 13 | +   "<|bs|>": 50315, |
| 14 | +   "<|ca|>": 50270, |
| 15 | +   "<|cs|>": 50283, |
| 16 | +   "<|cy|>": 50297, |
| 17 | +   "<|da|>": 50285, |
| 18 | +   "<|de|>": 50261, |
| 19 | +   "<|el|>": 50281, |
| 20 | +   "<|endoftext|>": 50257, |
| 21 | +   "<|en|>": 50259, |
| 22 | +   "<|es|>": 50262, |
| 23 | +   "<|et|>": 50307, |
| 24 | +   "<|eu|>": 50310, |
| 25 | +   "<|fa|>": 50300, |
| 26 | +   "<|fi|>": 50277, |
| 27 | +   "<|fo|>": 50338, |
| 28 | +   "<|fr|>": 50265, |
| 29 | +   "<|gl|>": 50319, |
| 30 | +   "<|gu|>": 50333, |
| 31 | +   "<|haw|>": 50352, |
| 32 | +   "<|ha|>": 50354, |
| 33 | +   "<|hi|>": 50276, |
| 34 | +   "<|hr|>": 50291, |
| 35 | +   "<|ht|>": 50339, |
| 36 | +   "<|hu|>": 50286, |
| 37 | +   "<|hy|>": 50312, |
| 38 | +   "<|id|>": 50275, |
| 39 | +   "<|is|>": 50311, |
| 40 | +   "<|it|>": 50274, |
| 41 | +   "<|iw|>": 50279, |
| 42 | +   "<|ja|>": 50266, |
| 43 | +   "<|jw|>": 50356, |
| 44 | +   "<|ka|>": 50329, |
| 45 | +   "<|kk|>": 50316, |
| 46 | +   "<|km|>": 50323, |
| 47 | +   "<|kn|>": 50306, |
| 48 | +   "<|ko|>": 50264, |
| 49 | +   "<|la|>": 50294, |
| 50 | +   "<|lb|>": 50345, |
| 51 | +   "<|ln|>": 50353, |
| 52 | +   "<|lo|>": 50336, |
| 53 | +   "<|lt|>": 50293, |
| 54 | +   "<|lv|>": 50301, |
| 55 | +   "<|mg|>": 50349, |
| 56 | +   "<|mi|>": 50295, |
| 57 | +   "<|mk|>": 50308, |
| 58 | +   "<|ml|>": 50296, |
| 59 | +   "<|mn|>": 50314, |
| 60 | +   "<|mr|>": 50320, |
| 61 | +   "<|ms|>": 50282, |
| 62 | +   "<|mt|>": 50343, |
| 63 | +   "<|my|>": 50346, |
| 64 | +   "<|ne|>": 50313, |
| 65 | +   "<|nl|>": 50271, |
| 66 | +   "<|nn|>": 50342, |
| 67 | +   "<|nocaptions|>": 50362, |
| 68 | +   "<|notimestamps|>": 50363, |
| 69 | +   "<|no|>": 50288, |
| 70 | +   "<|oc|>": 50328, |
| 71 | +   "<|pa|>": 50321, |
| 72 | +   "<|pl|>": 50269, |
| 73 | +   "<|ps|>": 50340, |
| 74 | +   "<|pt|>": 50267, |
| 75 | +   "<|ro|>": 50284, |
| 76 | +   "<|ru|>": 50263, |
| 77 | +   "<|sa|>": 50344, |
| 78 | +   "<|sd|>": 50332, |
| 79 | +   "<|si|>": 50322, |
| 80 | +   "<|sk|>": 50298, |
| 81 | +   "<|sl|>": 50305, |
| 82 | +   "<|sn|>": 50324, |
| 83 | +   "<|so|>": 50326, |
| 84 | +   "<|sq|>": 50317, |
| 85 | +   "<|sr|>": 50303, |
| 86 | +   "<|startoflm|>": 50360, |
| 87 | +   "<|startofprev|>": 50361, |
| 88 | +   "<|startoftranscript|>": 50258, |
| 89 | +   "<|su|>": 50357, |
| 90 | +   "<|sv|>": 50273, |
| 91 | +   "<|sw|>": 50318, |
| 92 | +   "<|ta|>": 50287, |
| 93 | +   "<|te|>": 50299, |
| 94 | +   "<|tg|>": 50331, |
| 95 | +   "<|th|>": 50289, |
| 96 | +   "<|tk|>": 50341, |
| 97 | +   "<|tl|>": 50348, |
| 98 | +   "<|transcribe|>": 50359, |
| 99 | +   "<|translate|>": 50358, |
| 100 | +   "<|tr|>": 50268, |
| 101 | +   "<|tt|>": 50351, |
| 102 | +   "<|uk|>": 50280, |
| 103 | +   "<|ur|>": 50290, |
| 104 | +   "<|uz|>": 50337, |
| 105 | +   "<|vi|>": 50278, |
| 106 | +   "<|yi|>": 50335, |
| 107 | +   "<|yo|>": 50325, |
| 108 | +   "<|zh|>": 50260 |
| 109 | + } |
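The mapping above is what the tokenizer resolves at runtime, so the IDs can be sanity-checked directly. A minimal sketch, assuming network access to the `openai/whisper-small` base checkpoint (the multilingual Whisper checkpoints share this vocabulary):

```python
from transformers import WhisperTokenizer

tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small")
print(tokenizer.convert_tokens_to_ids("<|es|>"))                 # 50262, as listed above
print(tokenizer.convert_tokens_to_ids("<|startoftranscript|>"))  # 50258
```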
    	
config.json
ADDED
@@ -0,0 +1,42 @@
| 1 | + { |
| 2 | +   "_name_or_path": "juancopi81/whisper-medium-es-common-fleurs", |
| 3 | +   "activation_dropout": 0.0, |
| 4 | +   "activation_function": "gelu", |
| 5 | +   "architectures": [ |
| 6 | +     "WhisperForConditionalGeneration" |
| 7 | +   ], |
| 8 | +   "attention_dropout": 0.0, |
| 9 | +   "begin_suppress_tokens": [ |
| 10 | +     220, |
| 11 | +     50257 |
| 12 | +   ], |
| 13 | +   "bos_token_id": 50257, |
| 14 | +   "d_model": 1024, |
| 15 | +   "decoder_attention_heads": 16, |
| 16 | +   "decoder_ffn_dim": 4096, |
| 17 | +   "decoder_layerdrop": 0.0, |
| 18 | +   "decoder_layers": 24, |
| 19 | +   "decoder_start_token_id": 50258, |
| 20 | +   "dropout": 0.1, |
| 21 | +   "encoder_attention_heads": 16, |
| 22 | +   "encoder_ffn_dim": 4096, |
| 23 | +   "encoder_layerdrop": 0.0, |
| 24 | +   "encoder_layers": 24, |
| 25 | +   "eos_token_id": 50257, |
| 26 | +   "forced_decoder_ids": null, |
| 27 | +   "init_std": 0.02, |
| 28 | +   "is_encoder_decoder": true, |
| 29 | +   "max_length": 448, |
| 30 | +   "max_source_positions": 1500, |
| 31 | +   "max_target_positions": 448, |
| 32 | +   "model_type": "whisper", |
| 33 | +   "num_hidden_layers": 24, |
| 34 | +   "num_mel_bins": 80, |
| 35 | +   "pad_token_id": 50257, |
| 36 | +   "scale_embedding": false, |
| 37 | +   "suppress_tokens": [], |
| 38 | +   "torch_dtype": "float32", |
| 39 | +   "transformers_version": "4.26.0.dev0", |
| 40 | +   "use_cache": false, |
| 41 | +   "vocab_size": 51865 |
| 42 | + } |
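Note that `d_model=1024` with 24 encoder and 24 decoder layers is the Whisper medium architecture, consistent with the roughly 764M trainable parameters in the training log above. A minimal sketch to verify locally, assuming this `config.json` sits in the current directory as the notebook saves it:

```python
from transformers import WhisperConfig

config = WhisperConfig.from_pretrained("./")  # reads the config.json shown above
print(config.d_model, config.encoder_layers, config.decoder_layers)  # 1024 24 24
```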
    	
fine-tune-whisper-streaming.ipynb
ADDED
@@ -0,0 +1,1287 @@
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            {
         
     | 
| 2 | 
         
            +
             "cells": [
         
     | 
| 3 | 
         
            +
              {
         
     | 
| 4 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 5 | 
         
            +
               "id": "75b58048-7d14-4fc6-8085-1fc08c81b4a6",
         
     | 
| 6 | 
         
            +
               "metadata": {},
         
     | 
| 7 | 
         
            +
               "source": [
         
     | 
| 8 | 
         
            +
                "# Fine-Tune Whisper With 🤗 Transformers and Streaming Mode"
         
     | 
| 9 | 
         
            +
               ]
         
     | 
| 10 | 
         
            +
              },
         
     | 
| 11 | 
         
            +
              {
         
     | 
| 12 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 13 | 
         
            +
               "id": "fbfa8ad5-4cdc-4512-9058-836cbbf65e1a",
         
     | 
| 14 | 
         
            +
               "metadata": {},
         
     | 
| 15 | 
         
            +
               "source": [
         
     | 
| 16 | 
         
            +
                "In this Colab, we present a step-by-step guide on fine-tuning Whisper with Hugging Face 🤗 Transformers on 400 hours of speech data! Using streaming mode, we'll show how you can train a speech recongition model on any dataset, irrespective of size. With streaming mode, storage requirements are no longer a consideration: you can train a model on whatever dataset you want, even if it's download size exceeds your devices disk space. How can this be possible? It simply seems too good to be true! Well, rest assured it's not 😉 Carry on reading to find out more."
         
     | 
| 17 | 
         
            +
               ]
         
     | 
| 18 | 
         
            +
              },
         
     | 
| 19 | 
         
            +
              {
         
     | 
| 20 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 21 | 
         
            +
               "id": "afe0d503-ae4e-4aa7-9af4-dbcba52db41e",
         
     | 
| 22 | 
         
            +
               "metadata": {},
         
     | 
| 23 | 
         
            +
               "source": [
         
     | 
| 24 | 
         
            +
                "## Introduction"
         
     | 
| 25 | 
         
            +
               ]
         
     | 
| 26 | 
         
            +
              },
         
     | 
| 27 | 
         
            +
              {
         
     | 
| 28 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 29 | 
         
            +
               "id": "9ae91ed4-9c3e-4ade-938e-f4c2dcfbfdc0",
         
     | 
| 30 | 
         
            +
               "metadata": {},
         
     | 
| 31 | 
         
            +
               "source": [
         
     | 
| 32 | 
         
            +
                "Speech recognition datasets are large. A typical speech dataset consists of approximately 100 hours of audio-transcription data, requiring upwards of 130GB of storage space for download and preparation. For most ASR researchers, this is already at the upper limit of what is feasible for disk space. So what happens when we want to train on a larger dataset? The full [LibriSpeech](https://huggingface.co/datasets/librispeech_asr) dataset consists of 960 hours of audio data. Kensho's [SPGISpeech](https://huggingface.co/datasets/kensho/spgispeech) contains 5,000 hours of audio data. ML Commons [People's Speech](https://huggingface.co/datasets/MLCommons/peoples_speech) contains **30,000+** hours of audio data! Do we need to bite the bullet and buy additional storage? Or is there a way we can train on all of these datasets with no disk drive requirements?\n",
         
     | 
| 33 | 
         
            +
                "\n",
         
     | 
| 34 | 
         
            +
                "When training machine learning systems, we rarely use the entire dataset at once. We typically _batch_ our data into smaller subsets of data, and pass these incrementally through our training pipeline. This is because we train our system on an accelerator device, such as a GPU or TPU, which has a memory limit typically around 16GB. We have to fit our model, optimiser and training data all on the same accelerator device, so we usually have to divide the dataset up into smaller batches and move them from the CPU to the GPU when required.\n",
         
     | 
| 35 | 
         
            +
                "\n",
         
     | 
| 36 | 
         
            +
                "Consequently, we don't require the entire dataset to be downloaded at once; we simply need the batch of data that we pass to our model at any one go. We can leverage this principle of partial dataset loading when preparing our dataset: rather than downloading the entire dataset at the start, we can load each piece of data as and when we need it. For each batch, we load the relevant data from a remote server and pass it through the training pipeline. For the next batch, we load the next items and again pass them through the training pipeline. At no point do we have to save data to our disk drive, we simply load them in memory and use them in our pipeline. In doing so, we only ever need as much memory as each individual batch requires.\n",
         
     | 
| 37 | 
         
            +
                "\n",
         
     | 
| 38 | 
         
            +
                "This is analogous to downloading a TV show versus streaming it 📺 When we download a TV show, we download the entire video offline and save it to our disk. Compare this to when we stream a TV show. Here, we don't download any part of the video to memory, but iterate over the video file and load each part in real-time as required. It's this same principle that we can apply to our ML training pipeline! We want to iterate over the dataset and load each sample of data as required.\n",
         
     | 
| 39 | 
         
            +
                "\n",
         
     | 
| 40 | 
         
            +
                "While the principle of partial dataset loading sounds ideal, it also seems **pretty** difficult to do. Luckily for us, 🤗 Datasets allows us to do this with minimal code changes! We'll make use of the principle of [_streaming_](https://huggingface.co/docs/datasets/stream), depicted graphically in Figure 1. Streaming does exactly this: the data is loaded progressively as we iterate over the dataset, meaning it is only loaded as and when we need it. If you're familiar with 🤗 Transformers and Datasets, the content of this notebook will be very familiar, with some small extensions to support streaming mode."
         
     | 
| 41 | 
         
            +
               ]
         
     | 
| 42 | 
         
            +
              },
         
     | 
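As a concrete illustration of the streaming principle described above, here is a minimal sketch (assuming you have accepted the dataset's terms on the Hub and are authenticated) of what streaming looks like in 🤗 Datasets: `load_dataset` with `streaming=True` returns an iterable that downloads samples lazily.

from datasets import load_dataset

# streaming=True returns an IterableDataset that yields samples lazily,
# fetching each one from the remote server only when it is requested
dataset = load_dataset("mozilla-foundation/common_voice_11_0", "es", split="train", streaming=True, use_auth_token=True)

# nothing has been downloaded yet; pulling the first sample triggers the
# first (and only the first) download
sample = next(iter(dataset))
print(sample["sentence"])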
| 43 | 
         
            +
              {
         
     | 
| 44 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 45 | 
         
            +
               "id": "1c87f76e-47be-4a5d-bc52-7b1c2e9d4f5a",
         
     | 
| 46 | 
         
            +
               "metadata": {},
         
     | 
| 47 | 
         
            +
               "source": [
         
     | 
| 48 | 
         
            +
                "<figure>\n",
         
     | 
| 49 | 
         
            +
                "<img src=\"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/streaming.gif\" alt=\"Trulli\" style=\"width:100%\">\n",
         
     | 
| 50 | 
         
            +
                "<figcaption align = \"center\"><b>Figure 1:</b> Streaming mode. The dataset is divided into smaller subsets, with subsets loaded progressively as we iterate over the dataset. </figcaption>\n",
         
     | 
| 51 | 
         
            +
                "</figure>"
         
     | 
| 52 | 
         
            +
               ]
         
     | 
| 53 | 
         
            +
              },
         
     | 
| 54 | 
         
            +
              {
         
     | 
| 55 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 56 | 
         
            +
               "id": "21b6316e-8a55-4549-a154-66d3da2ab74a",
         
     | 
| 57 | 
         
            +
               "metadata": {},
         
     | 
| 58 | 
         
            +
               "source": [
         
     | 
| 59 | 
         
            +
                "This notebook provides a guide to fine-tuning on the task of _speech recognition_, which involves learning a\n",
         
     | 
| 60 | 
         
            +
                "mapping from speech to text. Speech recognition is divided into two categories: English-only or multilingual (all other languages). \n",
         
     | 
| 61 | 
         
            +
                "This notebook applies to both categories, with pointers for changing between languages and datasets.\n",
         
     | 
| 62 | 
         
            +
                "\n",
         
     | 
| 63 | 
         
            +
                "As for our model, we'll fine-tune the Whisper model released in [September 2022](https://openai.com/blog/whisper/) by the authors \n",
         
     | 
| 64 | 
         
            +
                "Alec Radford et al. from OpenAI. Whisper is an encoder-decoder model pre-trained on 680k hours of labelled audio-transcription data. \n",
         
     | 
| 65 | 
         
            +
                "It achieves strong performance on many speech recognition and speech translation datasets without fine-tuning. With fine-tuning, \n",
         
     | 
| 66 | 
         
            +
                "we aim to improve upon these results further, with many SoTA results up for grabs! For a full explanation on the Whisper model, the \n",
         
     | 
| 67 | 
         
            +
                "reader is advised to read the blog post [Fine-Tune Whisper with 🤗 Transformers](https://huggingface.co/blog/fine-tune-whisper#introduction).\n",
         
     | 
| 68 | 
         
            +
                "\n",
         
     | 
| 69 | 
         
            +
                "The Whisper checkpoints come in five configurations of varying model sizes.\n",
         
     | 
| 70 | 
         
            +
                "The smallest four are trained on either English-only or multilingual data.\n",
         
     | 
| 71 | 
         
            +
                "The largest checkpoint is multilingual only. All nine of the pre-trained checkpoints \n",
         
     | 
| 72 | 
         
            +
                "are available on the [Hugging Face Hub](https://huggingface.co/models?search=openai/whisper). The \n",
         
     | 
| 73 | 
         
            +
                "checkpoints are summarised in the following table with links to the models on the Hub:\n",
         
     | 
| 74 | 
         
            +
                "\n",
         
     | 
| 75 | 
         
            +
                "| Size   | Layers | Width | Heads | Parameters | English-only                                         | Multilingual                                      |\n",
         
     | 
| 76 | 
         
            +
                "|--------|--------|-------|-------|------------|------------------------------------------------------|---------------------------------------------------|\n",
         
     | 
| 77 | 
         
            +
                "| tiny   | 4      | 384   | 6     | 39 M       | [✓](https://huggingface.co/openai/whisper-tiny.en)   | [✓](https://huggingface.co/openai/whisper-tiny.)  |\n",
         
     | 
| 78 | 
         
            +
                "| base   | 6      | 512   | 8     | 74 M       | [✓](https://huggingface.co/openai/whisper-base.en)   | [✓](https://huggingface.co/openai/whisper-base)   |\n",
         
     | 
| 79 | 
         
            +
                "| small  | 12     | 768   | 12    | 244 M      | [✓](https://huggingface.co/openai/whisper-small.en)  | [✓](https://huggingface.co/openai/whisper-small)  |\n",
         
     | 
| 80 | 
         
            +
                "| medium | 24     | 1024  | 16    | 769 M      | [✓](https://huggingface.co/openai/whisper-medium.en) | [✓](https://huggingface.co/openai/whisper-medium) |\n",
         
     | 
| 81 | 
         
            +
                "| large  | 32     | 1280  | 20    | 1550 M     | x                                                    | [✓](https://huggingface.co/openai/whisper-large)  |\n",
         
     | 
| 82 | 
         
            +
                "\n",
         
     | 
| 83 | 
         
            +
                "When fine-tuning on an English dataset for speech recognition, it is recommeneded to select one of the English-only checkpoints. For any other language, it is recommended to select a multilingual checkpoint.\n",
         
     | 
| 84 | 
         
            +
                "\n",
         
     | 
| 85 | 
         
            +
                "For demonstration purposes, we'll fine-tune the multilingual version of the \n",
         
     | 
| 86 | 
         
            +
                "[`\"small\"`](https://huggingface.co/openai/whisper-small) checkpoint with 244M params (~= 1GB). \n",
         
     | 
| 87 | 
         
            +
                "As for our data, we'll train and evaluate our system on 400 hours of multilingual speech recognition data\n",
         
     | 
| 88 | 
         
            +
                "taken from the [Common Voice 11](https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0)\n",
         
     | 
| 89 | 
         
            +
                "dataset. We'll show how we can train a model on 400 hours of training data using the default disk space \n",
         
     | 
| 90 | 
         
            +
                "that comes with a standard GPU device or Google Colab."
         
     | 
| 91 | 
         
            +
               ]
         
     | 
| 92 | 
         
            +
              },
         
     | 
| 93 | 
         
            +
              {
         
     | 
| 94 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 95 | 
         
            +
               "id": "b219c9dd-39b6-4a95-b2a1-3f547a1e7bc0",
         
     | 
| 96 | 
         
            +
               "metadata": {},
         
     | 
| 97 | 
         
            +
               "source": [
         
     | 
| 98 | 
         
            +
                "## Load Dataset with Streaming"
         
     | 
| 99 | 
         
            +
               ]
         
     | 
| 100 | 
         
            +
              },
         
     | 
| 101 | 
         
            +
              {
         
     | 
| 102 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 103 | 
         
            +
               "id": "b17a4763-4381-4157-ae38-b04a8b5f1c43",
         
     | 
| 104 | 
         
            +
               "metadata": {},
         
     | 
| 105 | 
         
            +
               "source": [
         
     | 
| 106 | 
         
            +
                "This is where the magic happens! We'll first write a wrapper function around 🤗 Datasets `load_dataset` method. This function downloads the required splits using streaming mode by forcing `streaming=True` in the `load_dataset` method. Multiple splits can be combined (interleaved) by concatenating them with the \"+\" symbol when specifying the split name, e.g. `split=train+validation` will return a single split with the training and validation splits interleaved together. The function has the same arguments and key-word arguments as 🤗 Datasets `load_dataset` method, so we can use it in exactly the same way!"
         
     | 
| 107 | 
         
            +
               ]
         
     | 
| 108 | 
         
            +
              },
         
     | 
| 109 | 
         
            +
              {
         
     | 
| 110 | 
         
            +
               "cell_type": "code",
         
     | 
| 111 | 
         
            +
               "execution_count": 1,
         
     | 
| 112 | 
         
            +
               "id": "065a8cf7-e54f-4ac3-900e-609c80714fca",
         
     | 
| 113 | 
         
            +
               "metadata": {},
         
     | 
| 114 | 
         
            +
               "outputs": [],
         
     | 
| 115 | 
         
            +
               "source": [
         
     | 
| 116 | 
         
            +
                "from datasets import interleave_datasets, load_dataset\n",
         
     | 
| 117 | 
         
            +
                "\n",
         
     | 
| 118 | 
         
            +
                "def load_streaming_dataset(dataset_name, dataset_config_name, split, **kwargs):\n",
         
     | 
| 119 | 
         
            +
                "    if \"+\" in split:\n",
         
     | 
| 120 | 
         
            +
                "        # load multiple splits separated by the `+` symbol *with* streaming mode\n",
         
     | 
| 121 | 
         
            +
                "        dataset_splits = [load_dataset(dataset_name, dataset_config_name, split=split_name, streaming=True, **kwargs) for split_name in split.split(\"+\")]\n",
         
     | 
| 122 | 
         
            +
                "        # interleave multiple splits to form one dataset\n",
         
     | 
| 123 | 
         
            +
                "        interleaved_dataset = interleave_datasets(dataset_splits)\n",
         
     | 
| 124 | 
         
            +
                "        return interleaved_dataset\n",
         
     | 
| 125 | 
         
            +
                "    else:\n",
         
     | 
| 126 | 
         
            +
                "        # load a single split *with* streaming mode\n",
         
     | 
| 127 | 
         
            +
                "        dataset = load_dataset(dataset_name, dataset_config_name, split=split, streaming=True, **kwargs)\n",
         
     | 
| 128 | 
         
            +
                "        return dataset"
         
     | 
| 129 | 
         
            +
               ]
         
     | 
| 130 | 
         
            +
              },
         
     | 
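As a quick sanity check, here is a hypothetical usage of the wrapper defined above, interleaving the train and validation splits of the Spanish Common Voice 11 data in streaming mode:

# hypothetical usage of the wrapper: combine two splits into one stream
combined = load_streaming_dataset(
    "mozilla-foundation/common_voice_11_0", "es", split="train+validation", use_auth_token=True
)
print(next(iter(combined))["sentence"])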
| 131 | 
         
            +
              {
         
     | 
| 132 | 
         
            +
               "cell_type": "code",
         
     | 
| 133 | 
         
            +
               "execution_count": 2,
         
     | 
| 134 | 
         
            +
               "id": "ed0df0dc-8c2a-47c9-b105-49d61aec9890",
         
     | 
| 135 | 
         
            +
               "metadata": {},
         
     | 
| 136 | 
         
            +
               "outputs": [],
         
     | 
| 137 | 
         
            +
               "source": [
         
     | 
| 138 | 
         
            +
                "from datasets import Audio, interleave_datasets, IterableDataset, load_dataset\n",
         
     | 
| 139 | 
         
            +
                "from typing import List, Optional"
         
     | 
| 140 | 
         
            +
               ]
         
     | 
| 141 | 
         
            +
              },
         
     | 
| 142 | 
         
            +
              {
         
     | 
| 143 | 
         
            +
               "cell_type": "code",
         
     | 
| 144 | 
         
            +
               "execution_count": 3,
         
     | 
| 145 | 
         
            +
               "id": "fa07e8c0-1874-43e7-8eec-fac124d0cdfe",
         
     | 
| 146 | 
         
            +
               "metadata": {},
         
     | 
| 147 | 
         
            +
               "outputs": [],
         
     | 
| 148 | 
         
            +
               "source": [
         
     | 
| 149 | 
         
            +
                "dataset_names = [\"mozilla-foundation/common_voice_11_0\", \"google/fleurs\"]\n",
         
     | 
| 150 | 
         
            +
                "dataset_config_names = [\"es\", \"es_419\"]\n",
         
     | 
| 151 | 
         
            +
                "text_column_names = [\"sentence\", \"raw_transcription\"]"
         
     | 
| 152 | 
         
            +
               ]
         
     | 
| 153 | 
         
            +
              },
         
     | 
| 154 | 
         
            +
              {
         
     | 
| 155 | 
         
            +
               "cell_type": "code",
         
     | 
| 156 | 
         
            +
               "execution_count": 4,
         
     | 
| 157 | 
         
            +
               "id": "88a7949b-60e2-4269-94da-e18d24dc3788",
         
     | 
| 158 | 
         
            +
               "metadata": {},
         
     | 
| 159 | 
         
            +
               "outputs": [],
         
     | 
| 160 | 
         
            +
               "source": [
         
     | 
| 161 | 
         
            +
                "def load_multiple_streaming_datasets(\n",
         
     | 
| 162 | 
         
            +
                "    dataset_names: List,\n",
         
     | 
| 163 | 
         
            +
                "    dataset_config_names: List,\n",
         
     | 
| 164 | 
         
            +
                "    splits: Optional[List] = None,\n",
         
     | 
| 165 | 
         
            +
                "    text_column_names: Optional[List] = None,\n",
         
     | 
| 166 | 
         
            +
                "    sampling_rate: Optional[int] = 16000,\n",
         
     | 
| 167 | 
         
            +
                "    stopping_strategy: Optional[str] = \"all_exhausted\",\n",
         
     | 
| 168 | 
         
            +
                "    **kwargs\n",
         
     | 
| 169 | 
         
            +
                ") -> IterableDataset:\n",
         
     | 
| 170 | 
         
            +
                "\n",
         
     | 
| 171 | 
         
            +
                "    if len(dataset_names) != len(dataset_config_names):\n",
         
     | 
| 172 | 
         
            +
                "        raise ValueError(\n",
         
     | 
| 173 | 
         
            +
                "            f\"Ensure one config is passed for each dataset, got {len(dataset_names)} datasets and\"\n",
         
     | 
| 174 | 
         
            +
                "            f\" {len(dataset_config_names)} configs.\"\n",
         
     | 
| 175 | 
         
            +
                "        )\n",
         
     | 
| 176 | 
         
            +
                "\n",
         
     | 
| 177 | 
         
            +
                "    if splits is not None and len(splits) != len(dataset_names):\n",
         
     | 
| 178 | 
         
            +
                "        raise ValueError(\n",
         
     | 
| 179 | 
         
            +
                "            f\"Ensure one split is passed for each dataset, got {len(dataset_names)} datasets and {len(splits)} splits.\"\n",
         
     | 
| 180 | 
         
            +
                "        )\n",
         
     | 
| 181 | 
         
            +
                "\n",
         
     | 
| 182 | 
         
            +
                "    if text_column_names is not None and len(text_column_names) != len(dataset_names):\n",
         
     | 
| 183 | 
         
            +
                "        raise ValueError(\n",
         
     | 
| 184 | 
         
            +
                "            f\"Ensure one text column name is passed for each dataset, got {len(dataset_names)} datasets and\"\n",
         
     | 
| 185 | 
         
            +
                "            f\" {len(text_column_names)} text column names.\"\n",
         
     | 
| 186 | 
         
            +
                "        )\n",
         
     | 
| 187 | 
         
            +
                "\n",
         
     | 
| 188 | 
         
            +
                "    splits = splits if splits is not None else [\"train\" for i in range(len(dataset_names))]\n",
         
     | 
| 189 | 
         
            +
                "    text_column_names = (\n",
         
     | 
| 190 | 
         
            +
                "        text_column_names if text_column_names is not None else [\"text\" for i in range(len(dataset_names))]\n",
         
     | 
| 191 | 
         
            +
                "    )\n",
         
     | 
| 192 | 
         
            +
                "\n",
         
     | 
| 193 | 
         
            +
                "    all_datasets = []\n",
         
     | 
| 194 | 
         
            +
                "    # iterate over the datasets we want to interleave\n",
         
     | 
| 195 | 
         
            +
                "    for i, dataset_name in enumerate(dataset_names):\n",
         
     | 
| 196 | 
         
            +
                "        dataset = load_dataset(dataset_name, dataset_config_names[i], split=splits[i], streaming=True, **kwargs)\n",
         
     | 
| 197 | 
         
            +
                "        # resample to specified sampling rate\n",
         
     | 
| 198 | 
         
            +
                "        dataset = dataset.cast_column(\"audio\", Audio(sampling_rate))\n",
         
     | 
| 199 | 
         
            +
                "        #  normalise columns to [\"audio\", \"sentence\"]\n",
         
     | 
| 200 | 
         
            +
                "        if text_column_names[i] != \"sentence\":\n",
         
     | 
| 201 | 
         
            +
                "            dataset = dataset.rename_column(text_column_names[i], \"sentence\")\n",
         
     | 
| 202 | 
         
            +
                "        dataset = dataset.remove_columns(set(dataset.features.keys()) - set([\"audio\", \"sentence\"]))\n",
         
     | 
| 203 | 
         
            +
                "        all_datasets.append(dataset)\n",
         
     | 
| 204 | 
         
            +
                "\n",
         
     | 
| 205 | 
         
            +
                "    interleaved_dataset = interleave_datasets(all_datasets, stopping_strategy=stopping_strategy)\n",
         
     | 
| 206 | 
         
            +
                "    return interleaved_dataset"
         
     | 
| 207 | 
         
            +
               ]
         
     | 
| 208 | 
         
            +
              },
         
     | 
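A note on the `stopping_strategy` argument, with a sketch of the alternative setting: in 🤗 Datasets, `"first_exhausted"` (the default for `interleave_datasets`) stops the interleaved stream as soon as the smallest dataset runs out of samples, while `"all_exhausted"`, used above, keeps cycling the smaller datasets until every dataset has been seen in full — an oversampling strategy suited to mixing datasets of very different sizes.

# sketch: stop as soon as the smallest dataset is exhausted instead of
# oversampling the smaller datasets
mixed = load_multiple_streaming_datasets(
    dataset_names,
    dataset_config_names=dataset_config_names,
    text_column_names=text_column_names,
    stopping_strategy="first_exhausted",
    use_auth_token=True,
)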
| 209 | 
         
            +
              {
         
     | 
| 210 | 
         
            +
               "cell_type": "code",
         
     | 
| 211 | 
         
            +
               "execution_count": 5,
         
     | 
| 212 | 
         
            +
               "id": "1f3e756f-f55f-4077-951f-6d04930bf5d2",
         
     | 
| 213 | 
         
            +
               "metadata": {},
         
     | 
| 214 | 
         
            +
               "outputs": [],
         
     | 
| 215 | 
         
            +
               "source": [
         
     | 
| 216 | 
         
            +
                "traind_ds = load_multiple_streaming_datasets(dataset_names, dataset_config_names=dataset_config_names, text_column_names=text_column_names, use_auth_token=True)"
         
     | 
| 217 | 
         
            +
               ]
         
     | 
| 218 | 
         
            +
              },
         
     | 
| 219 | 
         
            +
              {
         
     | 
| 220 | 
         
            +
               "cell_type": "code",
         
     | 
| 221 | 
         
            +
               "execution_count": 6,
         
     | 
| 222 | 
         
            +
               "id": "0155ff1a-8a3e-406b-8700-dcaafd9535cf",
         
     | 
| 223 | 
         
            +
               "metadata": {},
         
     | 
| 224 | 
         
            +
               "outputs": [
         
     | 
| 225 | 
         
            +
                {
         
     | 
| 226 | 
         
            +
                 "name": "stderr",
         
     | 
| 227 | 
         
            +
                 "output_type": "stream",
         
     | 
| 228 | 
         
            +
                 "text": [
         
     | 
| 229 | 
         
            +
                  "Reading metadata...: 230467it [00:05, 42062.14it/s]\n"
         
     | 
| 230 | 
         
            +
                 ]
         
     | 
| 231 | 
         
            +
                },
         
     | 
| 232 | 
         
            +
                {
         
     | 
| 233 | 
         
            +
                 "name": "stdout",
         
     | 
| 234 | 
         
            +
                 "output_type": "stream",
         
     | 
| 235 | 
         
            +
                 "text": [
         
     | 
| 236 | 
         
            +
                  "0 ¿ Qué tal a tres de cinco ?\n",
         
     | 
| 237 | 
         
            +
                  "1 El uso de Internet y de la red informática mundial permite que los estudiantes tengan acceso a la información en todo momento.\n",
         
     | 
| 238 | 
         
            +
                  "2 vamos , quiero decir , que no soy de citas especiales .\n",
         
     | 
| 239 | 
         
            +
                  "3 Los deportes de nieve en descenso, como el esquí y la tablanieve, son disciplinas populares que consisten en deslizarse con esquís o una tabla fijada a los pies, sobre un terreno nevado.\n",
         
     | 
| 240 | 
         
            +
                  "4 fray Lope , en aquel momento , colmaba otro vaso igual :\n",
         
     | 
| 241 | 
         
            +
                  "5 El título de «capital de la moda» fue traspasado, a partir de entonces, de Constantinopla a París.\n",
         
     | 
| 242 | 
         
            +
                  "6 hermanito . dice hermanito . anda ...\n",
         
     | 
| 243 | 
         
            +
                  "7 Diez años después, estuvo al mando de la participación soviética en la misión Apollo-Soyuz, que simbolizaba el fin de la Carrera Espacial.\n",
         
     | 
| 244 | 
         
            +
                  "8 tengo un mensaje para usted\n",
         
     | 
| 245 | 
         
            +
                  "9 Haga averiguaciones en la institución y en la oficina de inmigración del destino donde pretende estudiar, a fin de tomar conocimiento en detalle los requisitos.\n"
         
     | 
| 246 | 
         
            +
                 ]
         
     | 
| 247 | 
         
            +
                }
         
     | 
| 248 | 
         
            +
               ],
         
     | 
| 249 | 
         
            +
               "source": [
         
     | 
| 250 | 
         
            +
                "for i, sample in enumerate(traind_ds):\n",
         
     | 
| 251 | 
         
            +
                "    print(i, sample[\"sentence\"])\n",
         
     | 
| 252 | 
         
            +
                "    if i == 9:\n",
         
     | 
| 253 | 
         
            +
                "        break"
         
     | 
| 254 | 
         
            +
               ]
         
     | 
| 255 | 
         
            +
              },
         
     | 
| 256 | 
         
            +
              {
         
     | 
| 257 | 
         
            +
               "cell_type": "code",
         
     | 
| 258 | 
         
            +
               "execution_count": 7,
         
     | 
| 259 | 
         
            +
               "id": "d8be8403-334f-4485-aff0-55f2a3cc3680",
         
     | 
| 260 | 
         
            +
               "metadata": {},
         
     | 
| 261 | 
         
            +
               "outputs": [
         
     | 
| 262 | 
         
            +
                {
         
     | 
| 263 | 
         
            +
                 "data": {
         
     | 
| 264 | 
         
            +
                  "text/plain": [
         
     | 
| 265 | 
         
            +
                   "{'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None),\n",
         
     | 
| 266 | 
         
            +
                   " 'sentence': Value(dtype='string', id=None)}"
         
     | 
| 267 | 
         
            +
                  ]
         
     | 
| 268 | 
         
            +
                 },
         
     | 
| 269 | 
         
            +
                 "execution_count": 7,
         
     | 
| 270 | 
         
            +
                 "metadata": {},
         
     | 
| 271 | 
         
            +
                 "output_type": "execute_result"
         
     | 
| 272 | 
         
            +
                }
         
     | 
| 273 | 
         
            +
               ],
         
     | 
| 274 | 
         
            +
               "source": [
         
     | 
| 275 | 
         
            +
                "traind_ds.features"
         
     | 
| 276 | 
         
            +
               ]
         
     | 
| 277 | 
         
            +
              },
         
     | 
| 278 | 
         
            +
              {
         
     | 
| 279 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 280 | 
         
            +
               "id": "674429c5-0ab4-4adf-975b-621bb69eca38",
         
     | 
| 281 | 
         
            +
               "metadata": {},
         
     | 
| 282 | 
         
            +
               "source": [
         
     | 
| 283 | 
         
            +
                "We'll train our system on the Spanish split of [Common Voice 11](https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0). We can see how much training data we have by viewing the [language page](https://commonvoice.mozilla.org/en/datasets) on the Common Voice website. The Spanish split has over 400 hours of labelled training data - that's enourmous! More than we could ever fit on a Google Colab or a standard workstation. But with streaming mode, we'll only download data as and when we need it, making training on this dataset possible!\n",
         
     | 
| 284 | 
         
            +
                "\n",
         
     | 
| 285 | 
         
            +
                "Since Spanish is relatively high-resource, we'll only use the `train` split for training and the `test` split for evaluation. If you're training on a low-resource language, such as the Hindi split of Common Voice 11, it's worth combining the `train` and `validation` splits to give a larger training set. You can achieve this by setting: `split=\"train+validation\"` for the training split.\n",
         
     | 
| 286 | 
         
            +
                "\n",
         
     | 
| 287 | 
         
            +
                "If you're using a gated dataset, like Common Voice 11, ensure you have accepted the terms of use on the Hugging Face Hub: [mozilla-foundation/common_voice_11_0](https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0). Once you have accepted the terms, you will have full access to the dataset and be able to load the data locally."
         
     | 
| 288 | 
         
            +
               ]
         
     | 
| 289 | 
         
            +
              },
         
     | 
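If you haven't authenticated yet, here is a minimal sketch of logging in from a notebook with `huggingface_hub` (an access token with read permissions is assumed):

from huggingface_hub import notebook_login

# log in with a Hugging Face access token so the gated Common Voice 11
# data can be streamed with use_auth_token=True
notebook_login()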
| 290 | 
         
            +
              {
         
     | 
| 291 | 
         
            +
               "cell_type": "code",
         
     | 
| 292 | 
         
            +
               "execution_count": 8,
         
     | 
| 293 | 
         
            +
               "id": "a2787582-554f-44ce-9f38-4180a5ed6b44",
         
     | 
| 294 | 
         
            +
               "metadata": {},
         
     | 
| 295 | 
         
            +
               "outputs": [],
         
     | 
| 296 | 
         
            +
               "source": [
         
     | 
| 297 | 
         
            +
                "from datasets import IterableDatasetDict\n",
         
     | 
| 298 | 
         
            +
                "\n",
         
     | 
| 299 | 
         
            +
                "raw_datasets = IterableDatasetDict()\n",
         
     | 
| 300 | 
         
            +
                "\n",
         
     | 
| 301 | 
         
            +
                "raw_datasets[\"train\"] = traind_ds  # set split=\"train+validation\" for low-resource\n",
         
     | 
| 302 | 
         
            +
                "raw_datasets[\"test\"] = load_streaming_dataset(\"mozilla-foundation/common_voice_11_0\", \"es\", split=\"test\", use_auth_token=True)\n",
         
     | 
| 303 | 
         
            +
                "# raw_datasets[\"train\"] = traind_ds"
         
     | 
| 304 | 
         
            +
               ]
         
     | 
| 305 | 
         
            +
              },
         
     | 
| 306 | 
         
            +
              {
         
     | 
| 307 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 308 | 
         
            +
               "id": "2d63b2d2-f68a-4d74-b7f1-5127f6d16605",
         
     | 
| 309 | 
         
            +
               "metadata": {},
         
     | 
| 310 | 
         
            +
               "source": [
         
     | 
| 311 | 
         
            +
                "## Prepare Processor and Pre-Process Data"
         
     | 
| 312 | 
         
            +
               ]
         
     | 
| 313 | 
         
            +
              },
         
     | 
| 314 | 
         
            +
              {
         
     | 
| 315 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 316 | 
         
            +
               "id": "601c3099-1026-439e-93e2-5635b3ba5a73",
         
     | 
| 317 | 
         
            +
               "metadata": {},
         
     | 
| 318 | 
         
            +
               "source": [
         
     | 
| 319 | 
         
            +
                "The ASR pipeline can be de-composed into three stages: \n",
         
     | 
| 320 | 
         
            +
                "1) A feature extractor which pre-processes the raw audio-inputs\n",
         
     | 
| 321 | 
         
            +
                "2) The model which performs the sequence-to-sequence mapping \n",
         
     | 
| 322 | 
         
            +
                "3) A tokenizer which post-processes the model outputs to text format\n",
         
     | 
| 323 | 
         
            +
                "\n",
         
     | 
| 324 | 
         
            +
                "In 🤗 Transformers, the Whisper model has an associated feature extractor and tokenizer, \n",
         
     | 
| 325 | 
         
            +
                "called [WhisperFeatureExtractor](https://huggingface.co/docs/transformers/main/model_doc/whisper#transformers.WhisperFeatureExtractor)\n",
         
     | 
| 326 | 
         
            +
                "and [WhisperTokenizer](https://huggingface.co/docs/transformers/main/model_doc/whisper#transformers.WhisperTokenizer) \n",
         
     | 
| 327 | 
         
            +
                "respectively. To make our lives simple, these two objects are wrapped under a single class, called the [WhisperProcessor](https://huggingface.co/docs/transformers/model_doc/whisper#transformers.WhisperProcessor). We can call the WhisperProcessor to perform \n",
         
     | 
| 328 | 
         
            +
                "both the audio pre-processing and the text token post-processing. In doing so, we only need to keep track of two objects during training: \n",
         
     | 
| 329 | 
         
            +
                "the `processor` and the `model`.\n",
         
     | 
| 330 | 
         
            +
                "\n",
         
     | 
| 331 | 
         
            +
                "If using a multilingual checkpoint, you should set the `\"language\"` to your target text language. You should also set the task to `\"transcribe\"` for speech recogntition and `\"translate\"` for speech translation. These arguments modify the behaviour of the tokenizer - they should be set correctly to ensure the target labels are encoded properly. These arguments should be omitted for English-only fine-tuning."
         
     | 
| 332 | 
         
            +
               ]
         
     | 
| 333 | 
         
            +
              },
         
     | 
| 334 | 
         
            +
              {
         
     | 
| 335 | 
         
            +
               "cell_type": "code",
         
     | 
| 336 | 
         
            +
               "execution_count": 9,
         
     | 
| 337 | 
         
            +
               "id": "77d9f0c5-8607-4642-a8ac-c3ab2e223ea6",
         
     | 
| 338 | 
         
            +
               "metadata": {},
         
     | 
| 339 | 
         
            +
               "outputs": [
         
     | 
| 340 | 
         
            +
                {
         
     | 
| 341 | 
         
            +
                 "data": {
         
     | 
| 342 | 
         
            +
                  "application/vnd.jupyter.widget-view+json": {
         
     | 
| 343 | 
         
            +
                   "model_id": "9769d7a9ab1148b8af2bd69abf74d5d6",
         
     | 
| 344 | 
         
            +
                   "version_major": 2,
         
     | 
| 345 | 
         
            +
                   "version_minor": 0
         
     | 
| 346 | 
         
            +
                  },
         
     | 
| 347 | 
         
            +
                  "text/plain": [
         
     | 
| 348 | 
         
            +
                   "Downloading:   0%|          | 0.00/185k [00:00<?, ?B/s]"
         
     | 
| 349 | 
         
            +
                  ]
         
     | 
| 350 | 
         
            +
                 },
         
     | 
| 351 | 
         
            +
                 "metadata": {},
         
     | 
| 352 | 
         
            +
                 "output_type": "display_data"
         
     | 
| 353 | 
         
            +
                },
         
     | 
| 354 | 
         
            +
                {
         
     | 
| 355 | 
         
            +
                 "data": {
         
     | 
| 356 | 
         
            +
                  "application/vnd.jupyter.widget-view+json": {
         
     | 
| 357 | 
         
            +
                   "model_id": "a2b4d68d48d8439096430441c976bd21",
         
     | 
| 358 | 
         
            +
                   "version_major": 2,
         
     | 
| 359 | 
         
            +
                   "version_minor": 0
         
     | 
| 360 | 
         
            +
                  },
         
     | 
| 361 | 
         
            +
                  "text/plain": [
         
     | 
| 362 | 
         
            +
                   "Downloading:   0%|          | 0.00/837 [00:00<?, ?B/s]"
         
     | 
| 363 | 
         
            +
                  ]
         
     | 
| 364 | 
         
            +
                 },
         
     | 
| 365 | 
         
            +
                 "metadata": {},
         
     | 
| 366 | 
         
            +
                 "output_type": "display_data"
         
     | 
| 367 | 
         
            +
                },
         
     | 
| 368 | 
         
            +
                {
         
     | 
| 369 | 
         
            +
                 "data": {
         
     | 
| 370 | 
         
            +
                  "application/vnd.jupyter.widget-view+json": {
         
     | 
| 371 | 
         
            +
                   "model_id": "ceae9b86f1674939b330c81cb34c625a",
         
     | 
| 372 | 
         
            +
                   "version_major": 2,
         
     | 
| 373 | 
         
            +
                   "version_minor": 0
         
     | 
| 374 | 
         
            +
                  },
         
     | 
| 375 | 
         
            +
                  "text/plain": [
         
     | 
| 376 | 
         
            +
                   "Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]"
         
     | 
| 377 | 
         
            +
                  ]
         
     | 
| 378 | 
         
            +
                 },
         
     | 
| 379 | 
         
            +
                 "metadata": {},
         
     | 
| 380 | 
         
            +
                 "output_type": "display_data"
         
     | 
| 381 | 
         
            +
                },
         
     | 
| 382 | 
         
            +
                {
         
     | 
| 383 | 
         
            +
                 "data": {
         
     | 
| 384 | 
         
            +
                  "application/vnd.jupyter.widget-view+json": {
         
     | 
| 385 | 
         
            +
                   "model_id": "715ade22144945178519b742a88828d7",
         
     | 
| 386 | 
         
            +
                   "version_major": 2,
         
     | 
| 387 | 
         
            +
                   "version_minor": 0
         
     | 
| 388 | 
         
            +
                  },
         
     | 
| 389 | 
         
            +
                  "text/plain": [
         
     | 
| 390 | 
         
            +
                   "Downloading:   0%|          | 0.00/494k [00:00<?, ?B/s]"
         
     | 
| 391 | 
         
            +
                  ]
         
     | 
| 392 | 
         
            +
                 },
         
     | 
| 393 | 
         
            +
                 "metadata": {},
         
     | 
| 394 | 
         
            +
                 "output_type": "display_data"
         
     | 
| 395 | 
         
            +
                },
         
     | 
| 396 | 
         
            +
                {
         
     | 
| 397 | 
         
            +
                 "data": {
         
     | 
| 398 | 
         
            +
                  "application/vnd.jupyter.widget-view+json": {
         
     | 
| 399 | 
         
            +
                   "model_id": "381fff2e1ffa4331923ca1b4b3dc965d",
         
     | 
| 400 | 
         
            +
                   "version_major": 2,
         
     | 
| 401 | 
         
            +
                   "version_minor": 0
         
     | 
| 402 | 
         
            +
                  },
         
     | 
| 403 | 
         
            +
                  "text/plain": [
         
     | 
| 404 | 
         
            +
                   "Downloading:   0%|          | 0.00/52.7k [00:00<?, ?B/s]"
         
     | 
| 405 | 
         
            +
                  ]
         
     | 
| 406 | 
         
            +
                 },
         
     | 
| 407 | 
         
            +
                 "metadata": {},
         
     | 
| 408 | 
         
            +
                 "output_type": "display_data"
         
     | 
| 409 | 
         
            +
                },
         
     | 
| 410 | 
         
            +
                {
         
     | 
| 411 | 
         
            +
                 "data": {
         
     | 
| 412 | 
         
            +
                  "application/vnd.jupyter.widget-view+json": {
         
     | 
| 413 | 
         
            +
                   "model_id": "7cf108c742b8431187e1e3494610df3c",
         
     | 
| 414 | 
         
            +
                   "version_major": 2,
         
     | 
| 415 | 
         
            +
                   "version_minor": 0
         
     | 
| 416 | 
         
            +
                  },
         
     | 
| 417 | 
         
            +
                  "text/plain": [
         
     | 
| 418 | 
         
            +
                   "Downloading:   0%|          | 0.00/2.11k [00:00<?, ?B/s]"
         
     | 
| 419 | 
         
            +
                  ]
         
     | 
| 420 | 
         
            +
                 },
         
     | 
| 421 | 
         
            +
                 "metadata": {},
         
     | 
| 422 | 
         
            +
                 "output_type": "display_data"
         
     | 
| 423 | 
         
            +
                },
         
     | 
| 424 | 
         
            +
                {
         
     | 
| 425 | 
         
            +
                 "data": {
         
     | 
| 426 | 
         
            +
                  "application/vnd.jupyter.widget-view+json": {
         
     | 
| 427 | 
         
            +
                   "model_id": "31a51dd942054666b52dce912df102a3",
         
     | 
| 428 | 
         
            +
                   "version_major": 2,
         
     | 
| 429 | 
         
            +
                   "version_minor": 0
         
     | 
| 430 | 
         
            +
                  },
         
     | 
| 431 | 
         
            +
                  "text/plain": [
         
     | 
| 432 | 
         
            +
                   "Downloading:   0%|          | 0.00/2.06k [00:00<?, ?B/s]"
         
     | 
| 433 | 
         
            +
                  ]
         
     | 
| 434 | 
         
            +
                 },
         
     | 
| 435 | 
         
            +
                 "metadata": {},
         
     | 
| 436 | 
         
            +
                 "output_type": "display_data"
         
     | 
| 437 | 
         
            +
                }
         
     | 
| 438 | 
         
            +
               ],
         
     | 
| 439 | 
         
            +
               "source": [
         
     | 
| 440 | 
         
            +
                "from transformers import WhisperProcessor\n",
         
     | 
| 441 | 
         
            +
                "\n",
         
     | 
| 442 | 
         
            +
                "processor = WhisperProcessor.from_pretrained(\"juancopi81/whisper-medium-es-common-fleurs\", language=\"Spanish\", task=\"transcribe\")"
         
     | 
| 443 | 
         
            +
               ]
         
     | 
| 444 | 
         
            +
              },
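               {
                "cell_type": "markdown",
                "metadata": {},
                "source": [
                 "To verify the processor is configured correctly, we can check that the tokenizer round-trips a sample Spanish sentence (a quick check; the example sentence is arbitrary):"
                ]
               },
               {
                "cell_type": "code",
                "execution_count": null,
                "metadata": {},
                "outputs": [],
                "source": [
                 "# Sanity check (sketch): encode an arbitrary Spanish sentence and decode\n",
                 "# it back, with and without special tokens, to verify the tokenizer set-up.\n",
                 "input_str = \"Hola, ¿cómo estás?\"\n",
                 "labels = processor.tokenizer(input_str).input_ids\n",
                 "decoded_with_special = processor.tokenizer.decode(labels, skip_special_tokens=False)\n",
                 "decoded_str = processor.tokenizer.decode(labels, skip_special_tokens=True)\n",
                 "\n",
                 "print(f\"Input:                 {input_str}\")\n",
                 "print(f\"Decoded w/ special:    {decoded_with_special}\")\n",
                 "print(f\"Decoded w/out special: {decoded_str}\")\n",
                 "print(f\"Are equal:             {input_str == decoded_str}\")"
                ]
               },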
         
     | 
| 445 | 
         
            +
              {
         
     | 
| 446 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 447 | 
         
            +
               "id": "381acd09-0b0f-4d04-9eb3-f028ac0e5f2c",
         
     | 
| 448 | 
         
            +
               "metadata": {},
         
     | 
| 449 | 
         
            +
               "source": [
         
     | 
| 450 | 
         
            +
                "### Pre-Process Data"
         
     | 
| 451 | 
         
            +
               ]
         
     | 
| 452 | 
         
            +
              },
         
     | 
| 453 | 
         
            +
              {
         
     | 
| 454 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 455 | 
         
            +
               "id": "bf10cd3e-924e-44fc-8790-46e413de7b3d",
         
     | 
| 456 | 
         
            +
               "metadata": {},
         
     | 
| 457 | 
         
            +
               "source": [
         
     | 
| 458 | 
         
            +
                "Let's have a look at the dataset features. Pay particular attention to the `\"audio\"` column - this details the sampling rate of our audio inputs:"
         
     | 
| 459 | 
         
            +
               ]
         
     | 
| 460 | 
         
            +
              },
         
     | 
| 461 | 
         
            +
              {
         
     | 
| 462 | 
         
            +
               "cell_type": "code",
         
     | 
| 463 | 
         
            +
               "execution_count": 10,
         
     | 
| 464 | 
         
            +
               "id": "ab5a13b4-9bd4-4aa0-aef2-b3de9b762988",
         
     | 
| 465 | 
         
            +
               "metadata": {},
         
     | 
| 466 | 
         
            +
               "outputs": [
         
     | 
| 467 | 
         
            +
                {
         
     | 
| 468 | 
         
            +
                 "data": {
         
     | 
| 469 | 
         
            +
                  "text/plain": [
         
     | 
| 470 | 
         
            +
                   "{'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None),\n",
         
     | 
| 471 | 
         
            +
                   " 'sentence': Value(dtype='string', id=None)}"
         
     | 
| 472 | 
         
            +
                  ]
         
     | 
| 473 | 
         
            +
                 },
         
     | 
| 474 | 
         
            +
                 "execution_count": 10,
         
     | 
| 475 | 
         
            +
                 "metadata": {},
         
     | 
| 476 | 
         
            +
                 "output_type": "execute_result"
         
     | 
| 477 | 
         
            +
                }
         
     | 
| 478 | 
         
            +
               ],
         
     | 
| 479 | 
         
            +
               "source": [
         
     | 
| 480 | 
         
            +
                "raw_datasets[\"train\"].features"
         
     | 
| 481 | 
         
            +
               ]
         
     | 
| 482 | 
         
            +
              },
         
     | 
| 483 | 
         
            +
              {
         
     | 
| 484 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 485 | 
         
            +
               "id": "5a679f05-063d-41b3-9b58-4fc9c6ccf4fd",
         
     | 
| 486 | 
         
            +
               "metadata": {},
         
     | 
| 487 | 
         
            +
               "source": [
         
     | 
| 488 | 
         
            +
                "Since our input audio is sampled at 48kHz, we need to _downsample_ it to\n",
         
     | 
| 489 | 
         
            +
                "16kHz prior to passing it to the Whisper feature extractor, 16kHz being the sampling rate expected by the Whisper model. \n",
         
     | 
| 490 | 
         
            +
                "\n",
         
     | 
| 491 | 
         
            +
                "We'll set the audio inputs to the correct sampling rate using dataset's \n",
         
     | 
| 492 | 
         
            +
                "[`cast_column`](https://huggingface.co/docs/datasets/package_reference/main_classes.html?highlight=cast_column#datasets.DatasetDict.cast_column)\n",
         
     | 
| 493 | 
         
            +
                "method. This operation does not change the audio in-place, \n",
         
     | 
| 494 | 
         
            +
                "but rather signals to `datasets` to resample audio samples _on the fly_ the \n",
         
     | 
| 495 | 
         
            +
                "first time that they are loaded:"
         
     | 
| 496 | 
         
            +
               ]
         
     | 
| 497 | 
         
            +
              },
         
     | 
| 498 | 
         
            +
              {
         
     | 
| 499 | 
         
            +
               "cell_type": "code",
         
     | 
| 500 | 
         
            +
               "execution_count": 11,
         
     | 
| 501 | 
         
            +
               "id": "3ab6a724-3d1e-478b-a9e9-d2f85feb6c39",
         
     | 
| 502 | 
         
            +
               "metadata": {},
         
     | 
| 503 | 
         
            +
               "outputs": [],
         
     | 
| 504 | 
         
            +
               "source": [
         
     | 
| 505 | 
         
            +
                "from datasets import Audio\n",
         
     | 
| 506 | 
         
            +
                "\n",
         
     | 
| 507 | 
         
            +
                "raw_datasets = raw_datasets.cast_column(\"audio\", Audio(sampling_rate=16000))"
         
     | 
| 508 | 
         
            +
               ]
         
     | 
| 509 | 
         
            +
              },
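               {
                "cell_type": "markdown",
                "metadata": {},
                "source": [
                 "We can confirm the cast by pulling one example again - the decoded audio should now report a 16kHz sampling rate (a quick check; re-iterating the stream re-downloads the first sample):"
                ]
               },
               {
                "cell_type": "code",
                "execution_count": null,
                "metadata": {},
                "outputs": [],
                "source": [
                 "# Sanity check (sketch): after the cast, the first example should be\n",
                 "# decoded at 16kHz.\n",
                 "resampled = next(iter(raw_datasets[\"test\"]))[\"audio\"]\n",
                 "\n",
                 "print(resampled[\"sampling_rate\"])  # expected: 16000\n",
                 "print(resampled[\"array\"].shape)"
                ]
               },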
         
     | 
| 510 | 
         
            +
              {
         
     | 
| 511 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 512 | 
         
            +
               "id": "161322c2-94f3-4d26-9e1d-d9d5202ca3cf",
         
     | 
| 513 | 
         
            +
               "metadata": {},
         
     | 
| 514 | 
         
            +
               "source": [
         
     | 
| 515 | 
         
            +
                "We'll define our pre-processing strategy. We advise that you **do not** lower-case the transcriptions or remove punctuation unless mixing different datasets. This will enable you to fine-tune Whisper models that can predict punctuation and casing. Later, you will see how we can evaluate the predictions without punctuation or casing, so that the models benefit from the WER improvement obtained by normalising the transcriptions while still predicting fully formatted transcriptions."
         
     | 
| 516 | 
         
            +
               ]
         
     | 
| 517 | 
         
            +
              },
         
     | 
| 518 | 
         
            +
              {
         
     | 
| 519 | 
         
            +
               "cell_type": "code",
         
     | 
| 520 | 
         
            +
               "execution_count": 12,
         
     | 
| 521 | 
         
            +
               "id": "d041650e-1c48-4439-87b3-5b6f4a514107",
         
     | 
| 522 | 
         
            +
               "metadata": {},
         
     | 
| 523 | 
         
            +
               "outputs": [],
         
     | 
| 524 | 
         
            +
               "source": [
         
     | 
| 525 | 
         
            +
                "from transformers.models.whisper.english_normalizer import BasicTextNormalizer\n",
         
     | 
| 526 | 
         
            +
                "\n",
         
     | 
| 527 | 
         
            +
                "do_lower_case = False\n",
         
     | 
| 528 | 
         
            +
                "do_remove_punctuation = False\n",
         
     | 
| 529 | 
         
            +
                "\n",
         
     | 
| 530 | 
         
            +
                "normalizer = BasicTextNormalizer()"
         
     | 
| 531 | 
         
            +
               ]
         
     | 
| 532 | 
         
            +
              },
         
     | 
| 533 | 
         
            +
              {
         
     | 
| 534 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 535 | 
         
            +
               "id": "bfaa935b-a11d-497c-88c1-0c4d1bb3247b",
         
     | 
| 536 | 
         
            +
               "metadata": {},
         
     | 
| 537 | 
         
            +
               "source": [
         
     | 
| 538 | 
         
            +
                "Now we can write a function to prepare our data ready for the model:\n",
         
     | 
| 539 | 
         
            +
                "1. We load and resample the audio data by calling `batch[\"audio\"]`. As explained above, 🤗 Datasets performs any necessary resampling operations on the fly.\n",
         
     | 
| 540 | 
         
            +
                "2. We use the feature extractor to compute the log-Mel spectrogram input features from our 1-dimensional audio array.\n",
         
     | 
| 541 | 
         
            +
                "3. We perform any optional pre-processing (lower-case or remove punctuation).\n",
         
     | 
| 542 | 
         
            +
                "4. We encode the transcriptions to label ids through the use of the tokenizer."
         
     | 
| 543 | 
         
            +
               ]
         
     | 
| 544 | 
         
            +
              },
         
     | 
| 545 | 
         
            +
              {
         
     | 
| 546 | 
         
            +
               "cell_type": "code",
         
     | 
| 547 | 
         
            +
               "execution_count": 13,
         
     | 
| 548 | 
         
            +
               "id": "c085911c-a10a-41ef-8874-306e0503e9bb",
         
     | 
| 549 | 
         
            +
               "metadata": {},
         
     | 
| 550 | 
         
            +
               "outputs": [],
         
     | 
| 551 | 
         
            +
               "source": [
         
     | 
| 552 | 
         
            +
                "def prepare_dataset(batch):\n",
         
     | 
| 553 | 
         
            +
                "    # load and (possibly) resample audio data to 16kHz\n",
         
     | 
| 554 | 
         
            +
                "    audio = batch[\"audio\"]\n",
         
     | 
| 555 | 
         
            +
                "\n",
         
     | 
| 556 | 
         
            +
                "    # compute log-Mel input features from input audio array \n",
         
     | 
| 557 | 
         
            +
                "    batch[\"input_features\"] = processor.feature_extractor(audio[\"array\"], sampling_rate=audio[\"sampling_rate\"]).input_features[0]\n",
         
     | 
| 558 | 
         
            +
                "    # compute input length of audio sample in seconds\n",
         
     | 
| 559 | 
         
            +
                "    batch[\"input_length\"] = len(audio[\"array\"]) / audio[\"sampling_rate\"]\n",
         
     | 
| 560 | 
         
            +
                "    \n",
         
     | 
| 561 | 
         
            +
                "    # optional pre-processing steps\n",
         
     | 
| 562 | 
         
            +
                "    transcription = batch[\"sentence\"]\n",
         
     | 
| 563 | 
         
            +
                "    if do_lower_case:\n",
         
     | 
| 564 | 
         
            +
                "        transcription = transcription.lower()\n",
         
     | 
| 565 | 
         
            +
                "    if do_remove_punctuation:\n",
         
     | 
| 566 | 
         
            +
                "        transcription = normalizer(transcription).strip()\n",
         
     | 
| 567 | 
         
            +
                "    \n",
         
     | 
| 568 | 
         
            +
                "    # encode target text to label ids\n",
         
     | 
| 569 | 
         
            +
                "    batch[\"labels\"] = processor.tokenizer(transcription).input_ids\n",
         
     | 
| 570 | 
         
            +
                "    return batch"
         
     | 
| 571 | 
         
            +
               ]
         
     | 
| 572 | 
         
            +
              },
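               {
                "cell_type": "markdown",
                "metadata": {},
                "source": [
                 "Before mapping over the full stream, we can dry-run `prepare_dataset` on a single example to inspect its outputs (an illustrative check; the 80 x 3000 shape is what the Whisper feature extractor produces for 30s-padded inputs):"
                ]
               },
               {
                "cell_type": "code",
                "execution_count": null,
                "metadata": {},
                "outputs": [],
                "source": [
                 "import numpy as np\n",
                 "\n",
                 "# Sanity check (sketch): apply the preparation function to one example.\n",
                 "prepared = prepare_dataset(next(iter(raw_datasets[\"test\"])))\n",
                 "\n",
                 "print(np.asarray(prepared[\"input_features\"]).shape)  # (80, 3000): mel bins x frames\n",
                 "print(prepared[\"input_length\"])                      # audio length in seconds\n",
                 "print(len(prepared[\"labels\"]))                       # number of label token ids"
                ]
               },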
         
     | 
| 573 | 
         
            +
              {
         
     | 
| 574 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 575 | 
         
            +
               "id": "70b319fb-2439-4ef6-a70d-a47bf41c4a13",
         
     | 
| 576 | 
         
            +
               "metadata": {},
         
     | 
| 577 | 
         
            +
               "source": [
         
     | 
| 578 | 
         
            +
                "We can apply the data preparation function to all of our training examples using 🤗 Datasets' `.map` method. We'll remove all of the columns from the raw training data, leaving just the `input_features` and `labels` defined in the `prepare_dataset` function:"
         
     | 
| 579 | 
         
            +
               ]
         
     | 
| 580 | 
         
            +
              },
         
     | 
| 581 | 
         
            +
              {
         
     | 
| 582 | 
         
            +
               "cell_type": "code",
         
     | 
| 583 | 
         
            +
               "execution_count": 14,
         
     | 
| 584 | 
         
            +
               "id": "a37a7cdb-9013-427f-8de9-6a8d0e9dc684",
         
     | 
| 585 | 
         
            +
               "metadata": {},
         
     | 
| 586 | 
         
            +
               "outputs": [],
         
     | 
| 587 | 
         
            +
               "source": [
         
     | 
| 588 | 
         
            +
                "vectorized_datasets = raw_datasets.map(prepare_dataset, remove_columns=list(next(iter(raw_datasets.values())).features)).with_format(\"torch\")"
         
     | 
| 589 | 
         
            +
               ]
         
     | 
| 590 | 
         
            +
              },
         
     | 
| 591 | 
         
            +
              {
         
     | 
| 592 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 593 | 
         
            +
               "id": "3d59b37e-4950-47ec-9e3e-2cf2ec7fc750",
         
     | 
| 594 | 
         
            +
               "metadata": {},
         
     | 
| 595 | 
         
            +
               "source": [
         
     | 
| 596 | 
         
            +
                "We can now define how we shuffle the data in the train split. The size of the subset we load is set by the variable `buffer_size`. You can increase or decrease this depending on your memory constraints. In this example, the `buffer_size` is set to 500, meaning 500 samples are loaded before shuffling across the subset. The larger we set this value, the closer to True offline shuffling. The `seed` is set for reproducibility:"
         
     | 
| 597 | 
         
            +
               ]
         
     | 
| 598 | 
         
            +
              },
         
     | 
| 599 | 
         
            +
              {
         
     | 
| 600 | 
         
            +
               "cell_type": "code",
         
     | 
| 601 | 
         
            +
               "execution_count": 15,
         
     | 
| 602 | 
         
            +
               "id": "1b145699-acfc-4b1d-93a2-a2ad3d62674c",
         
     | 
| 603 | 
         
            +
               "metadata": {},
         
     | 
| 604 | 
         
            +
               "outputs": [],
         
     | 
| 605 | 
         
            +
               "source": [
         
     | 
| 606 | 
         
            +
                "vectorized_datasets[\"train\"] = vectorized_datasets[\"train\"].shuffle(\n",
         
     | 
| 607 | 
         
            +
                "    buffer_size=500,\n",
         
     | 
| 608 | 
         
            +
                "    seed=0,\n",
         
     | 
| 609 | 
         
            +
                ")"
         
     | 
| 610 | 
         
            +
               ]
         
     | 
| 611 | 
         
            +
              },
         
     | 
| 612 | 
         
            +
              {
         
     | 
| 613 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 614 | 
         
            +
               "id": "666b9ef0-7909-4e1e-a419-87604d233e29",
         
     | 
| 615 | 
         
            +
               "metadata": {},
         
     | 
| 616 | 
         
            +
               "source": [
         
     | 
| 617 | 
         
            +
                "Finally, we filter any training data with audio samples longer than 30s. These samples would otherwise be truncated by the Whisper feature-extractor which could affect the stability of training. We define a function that returns `True` for samples that are less than 30s, and `False` for those that are longer:"
         
     | 
| 618 | 
         
            +
               ]
         
     | 
| 619 | 
         
            +
              },
         
     | 
| 620 | 
         
            +
              {
         
     | 
| 621 | 
         
            +
               "cell_type": "code",
         
     | 
| 622 | 
         
            +
               "execution_count": 16,
         
     | 
| 623 | 
         
            +
               "id": "01cb25ef-4bb0-4325-9461-f59198acadf6",
         
     | 
| 624 | 
         
            +
               "metadata": {},
         
     | 
| 625 | 
         
            +
               "outputs": [],
         
     | 
| 626 | 
         
            +
               "source": [
         
     | 
| 627 | 
         
            +
                "max_input_length = 30.0\n",
         
     | 
| 628 | 
         
            +
                "\n",
         
     | 
| 629 | 
         
            +
                "def is_audio_in_length_range(length):\n",
         
     | 
| 630 | 
         
            +
                "    return length < max_input_length"
         
     | 
| 631 | 
         
            +
               ]
         
     | 
| 632 | 
         
            +
              },
         
     | 
| 633 | 
         
            +
              {
         
     | 
| 634 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 635 | 
         
            +
               "id": "28e37ac3-b1c5-465b-8586-7cfd8d76b0f1",
         
     | 
| 636 | 
         
            +
               "metadata": {},
         
     | 
| 637 | 
         
            +
               "source": [
         
     | 
| 638 | 
         
            +
                "We apply our filter function to all samples of our training dataset through 🤗 Datasets' `.filter` method:"
         
     | 
| 639 | 
         
            +
               ]
         
     | 
| 640 | 
         
            +
              },
         
     | 
| 641 | 
         
            +
              {
         
     | 
| 642 | 
         
            +
               "cell_type": "code",
         
     | 
| 643 | 
         
            +
               "execution_count": 17,
         
     | 
| 644 | 
         
            +
               "id": "333f7f6e-6053-4d3b-8924-c733c79b82ac",
         
     | 
| 645 | 
         
            +
               "metadata": {},
         
     | 
| 646 | 
         
            +
               "outputs": [],
         
     | 
| 647 | 
         
            +
               "source": [
         
     | 
| 648 | 
         
            +
                "vectorized_datasets[\"train\"] = vectorized_datasets[\"train\"].filter(\n",
         
     | 
| 649 | 
         
            +
                "    is_audio_in_length_range,\n",
         
     | 
| 650 | 
         
            +
                "    input_columns=[\"input_length\"],\n",
         
     | 
| 651 | 
         
            +
                ")"
         
     | 
| 652 | 
         
            +
               ]
         
     | 
| 653 | 
         
            +
              },
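               {
                "cell_type": "markdown",
                "metadata": {},
                "source": [
                 "Streaming datasets have no length known ahead of time, but we can preview the first example of the shuffled, filtered training stream with `take` to check the pipeline end-to-end (a quick illustrative check):"
                ]
               },
               {
                "cell_type": "code",
                "execution_count": null,
                "metadata": {},
                "outputs": [],
                "source": [
                 "# Sanity check (sketch): preview one example from the training stream;\n",
                 "# with_format(\"torch\") means the features come back as PyTorch tensors.\n",
                 "for example in vectorized_datasets[\"train\"].take(1):\n",
                 "    print(example[\"input_features\"].shape)  # torch.Size([80, 3000])\n",
                 "    print(example[\"labels\"].shape)          # 1-D tensor of label ids"
                ]
               },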
         
     | 
| 654 | 
         
            +
              {
         
     | 
| 655 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 656 | 
         
            +
               "id": "263a5a58-0239-4a25-b0df-c625fc9c5810",
         
     | 
| 657 | 
         
            +
               "metadata": {},
         
     | 
| 658 | 
         
            +
               "source": [
         
     | 
| 659 | 
         
            +
                "## Training and Evaluation"
         
     | 
| 660 | 
         
            +
               ]
         
     | 
| 661 | 
         
            +
              },
         
     | 
| 662 | 
         
            +
              {
         
     | 
| 663 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 664 | 
         
            +
               "id": "a693e768-c5a6-453f-89a1-b601dcf7daf7",
         
     | 
| 665 | 
         
            +
               "metadata": {},
         
     | 
| 666 | 
         
            +
               "source": [
         
     | 
| 667 | 
         
            +
                "Now that we've prepared our data, we're ready to dive into the training pipeline. \n",
         
     | 
| 668 | 
         
            +
                "The [🤗 Trainer](https://huggingface.co/transformers/master/main_classes/trainer.html?highlight=trainer)\n",
         
     | 
| 669 | 
         
            +
                "will do much of the heavy lifting for us. All we have to do is:\n",
         
     | 
| 670 | 
         
            +
                "\n",
         
     | 
| 671 | 
         
            +
                "- Define a data collator: the data collator takes our pre-processed data and prepares PyTorch tensors ready for the model.\n",
         
     | 
| 672 | 
         
            +
                "\n",
         
     | 
| 673 | 
         
            +
                "- Evaluation metrics: during evaluation, we want to evaluate the model using the [word error rate (WER)](https://huggingface.co/metrics/wer) metric. We need to define a `compute_metrics` function that handles this computation.\n",
         
     | 
| 674 | 
         
            +
                "\n",
         
     | 
| 675 | 
         
            +
                "- Load a pre-trained checkpoint: we need to load a pre-trained checkpoint and configure it correctly for training.\n",
         
     | 
| 676 | 
         
            +
                "\n",
         
     | 
| 677 | 
         
            +
                "- Define the training configuration: this will be used by the 🤗 Trainer to define the training schedule."
         
     | 
| 678 | 
         
            +
               ]
         
     | 
| 679 | 
         
            +
              },
         
     | 
| 680 | 
         
            +
              {
         
     | 
| 681 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 682 | 
         
            +
               "id": "8d230e6d-624c-400a-bbf5-fa660881df25",
         
     | 
| 683 | 
         
            +
               "metadata": {},
         
     | 
| 684 | 
         
            +
               "source": [
         
     | 
| 685 | 
         
            +
                "### Define a Data Collator"
         
     | 
| 686 | 
         
            +
               ]
         
     | 
| 687 | 
         
            +
              },
         
     | 
| 688 | 
         
            +
              {
         
     | 
| 689 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 690 | 
         
            +
               "id": "04def221-0637-4a69-b242-d3f0c1d0ee78",
         
     | 
| 691 | 
         
            +
               "metadata": {},
         
     | 
| 692 | 
         
            +
               "source": [
         
     | 
| 693 | 
         
            +
                "The data collator for a sequence-to-sequence speech model is unique in the sense that it \n",
         
     | 
| 694 | 
         
            +
                "treats the `input_features` and `labels` independently: the  `input_features` must be \n",
         
     | 
| 695 | 
         
            +
                "handled by the feature extractor and the `labels` by the tokenizer.\n",
         
     | 
| 696 | 
         
            +
                "\n",
         
     | 
| 697 | 
         
            +
                "The `input_features` are already padded to 30s and converted to a log-Mel spectrogram \n",
         
     | 
| 698 | 
         
            +
                "of fixed dimension by action of the feature extractor, so all we have to do is convert the `input_features`\n",
         
     | 
| 699 | 
         
            +
                "to batched PyTorch tensors. We do this using the feature extractor's `.pad` method with `return_tensors=pt`.\n",
         
     | 
| 700 | 
         
            +
                "\n",
         
     | 
| 701 | 
         
            +
                "The `labels` on the other hand are un-padded. We first pad the sequences\n",
         
     | 
| 702 | 
         
            +
                "to the maximum length in the batch using the tokenizer's `.pad` method. The padding tokens \n",
         
     | 
| 703 | 
         
            +
                "are then replaced by `-100` so that these tokens are **not** taken into account when \n",
         
     | 
| 704 | 
         
            +
                "computing the loss. We then cut the BOS token from the start of the label sequence as we \n",
         
     | 
| 705 | 
         
            +
                "append it later during training.\n",
         
     | 
| 706 | 
         
            +
                "\n",
         
     | 
| 707 | 
         
            +
                "We can leverage the `WhisperProcessor` we defined earlier to perform both the \n",
         
     | 
| 708 | 
         
            +
                "feature extractor and the tokenizer operations:"
         
     | 
| 709 | 
         
            +
               ]
         
     | 
| 710 | 
         
            +
              },
         
     | 
| 711 | 
         
            +
              {
         
     | 
| 712 | 
         
            +
               "cell_type": "code",
         
     | 
| 713 | 
         
            +
               "execution_count": 18,
         
     | 
| 714 | 
         
            +
               "id": "8326221e-ec13-4731-bb4e-51e5fc1486c5",
         
     | 
| 715 | 
         
            +
               "metadata": {},
         
     | 
| 716 | 
         
            +
               "outputs": [],
         
     | 
| 717 | 
         
            +
               "source": [
         
     | 
| 718 | 
         
            +
                "import torch\n",
         
     | 
| 719 | 
         
            +
                "\n",
         
     | 
| 720 | 
         
            +
                "from dataclasses import dataclass\n",
         
     | 
| 721 | 
         
            +
                "from typing import Any, Dict, List, Union\n",
         
     | 
| 722 | 
         
            +
                "\n",
         
     | 
| 723 | 
         
            +
                "@dataclass\n",
         
     | 
| 724 | 
         
            +
                "class DataCollatorSpeechSeq2SeqWithPadding:\n",
         
     | 
| 725 | 
         
            +
                "    processor: Any\n",
         
     | 
| 726 | 
         
            +
                "\n",
         
     | 
| 727 | 
         
            +
                "    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:\n",
         
     | 
| 728 | 
         
            +
                "        # split inputs and labels since they have to be of different lengths and need different padding methods\n",
         
     | 
| 729 | 
         
            +
                "        # first treat the audio inputs by simply returning torch tensors\n",
         
     | 
| 730 | 
         
            +
                "        input_features = [{\"input_features\": feature[\"input_features\"]} for feature in features]\n",
         
     | 
| 731 | 
         
            +
                "        batch = self.processor.feature_extractor.pad(input_features, return_tensors=\"pt\")\n",
         
     | 
| 732 | 
         
            +
                "\n",
         
     | 
| 733 | 
         
            +
                "        # get the tokenized label sequences\n",
         
     | 
| 734 | 
         
            +
                "        label_features = [{\"input_ids\": feature[\"labels\"]} for feature in features]\n",
         
     | 
| 735 | 
         
            +
                "        # pad the labels to max length\n",
         
     | 
| 736 | 
         
            +
                "        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors=\"pt\")\n",
         
     | 
| 737 | 
         
            +
                "\n",
         
     | 
| 738 | 
         
            +
                "        # replace padding with -100 to ignore loss correctly\n",
         
     | 
| 739 | 
         
            +
                "        labels = labels_batch[\"input_ids\"].masked_fill(labels_batch.attention_mask.ne(1), -100)\n",
         
     | 
| 740 | 
         
            +
                "\n",
         
     | 
| 741 | 
         
            +
                "        # if bos token is appended in previous tokenization step,\n",
         
     | 
| 742 | 
         
            +
                "        # cut bos token here as it's append later anyways\n",
         
     | 
| 743 | 
         
            +
                "        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():\n",
         
     | 
| 744 | 
         
            +
                "            labels = labels[:, 1:]\n",
         
     | 
| 745 | 
         
            +
                "\n",
         
     | 
| 746 | 
         
            +
                "        batch[\"labels\"] = labels\n",
         
     | 
| 747 | 
         
            +
                "\n",
         
     | 
| 748 | 
         
            +
                "        return batch"
         
     | 
| 749 | 
         
            +
               ]
         
     | 
| 750 | 
         
            +
              },
         
     | 
| 751 | 
         
            +
              {
         
     | 
| 752 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 753 | 
         
            +
               "id": "3cae7dbf-8a50-456e-a3a8-7fd005390f86",
         
     | 
| 754 | 
         
            +
               "metadata": {},
         
     | 
| 755 | 
         
            +
               "source": [
         
     | 
| 756 | 
         
            +
                "Let's initialise the data collator we've just defined:"
         
     | 
| 757 | 
         
            +
               ]
         
     | 
| 758 | 
         
            +
              },
         
     | 
| 759 | 
         
            +
              {
         
     | 
| 760 | 
         
            +
               "cell_type": "code",
         
     | 
| 761 | 
         
            +
               "execution_count": 19,
         
     | 
| 762 | 
         
            +
               "id": "fc834702-c0d3-4a96-b101-7b87be32bf42",
         
     | 
| 763 | 
         
            +
               "metadata": {},
         
     | 
| 764 | 
         
            +
               "outputs": [],
         
     | 
| 765 | 
         
            +
               "source": [
         
     | 
| 766 | 
         
            +
                "data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)"
         
     | 
| 767 | 
         
            +
               ]
         
     | 
| 768 | 
         
            +
              },
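               {
                "cell_type": "markdown",
                "metadata": {},
                "source": [
                 "We can smoke-test the collator on a couple of prepared examples - the batch should contain `input_features` padded to a fixed size and `labels` padded to the longest sequence in the batch (an illustrative check that streams two test examples):"
                ]
               },
               {
                "cell_type": "code",
                "execution_count": null,
                "metadata": {},
                "outputs": [],
                "source": [
                 "from itertools import islice\n",
                 "\n",
                 "# Sanity check (sketch): collate two prepared examples into a padded batch.\n",
                 "features = list(islice(iter(vectorized_datasets[\"test\"]), 2))\n",
                 "batch = data_collator(features)\n",
                 "\n",
                 "print({key: value.shape for key, value in batch.items()})\n",
                 "# expected: input_features (2, 80, 3000), labels (2, max_label_length)"
                ]
               },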
         
     | 
| 769 | 
         
            +
              {
         
     | 
| 770 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 771 | 
         
            +
               "id": "d62bb2ab-750a-45e7-82e9-61d6f4805698",
         
     | 
| 772 | 
         
            +
               "metadata": {},
         
     | 
| 773 | 
         
            +
               "source": [
         
     | 
| 774 | 
         
            +
                "### Evaluation Metrics"
         
     | 
| 775 | 
         
            +
               ]
         
     | 
| 776 | 
         
            +
              },
         
     | 
| 777 | 
         
            +
              {
         
     | 
| 778 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 779 | 
         
            +
               "id": "66fee1a7-a44c-461e-b047-c3917221572e",
         
     | 
| 780 | 
         
            +
               "metadata": {},
         
     | 
| 781 | 
         
            +
               "source": [
         
     | 
| 782 | 
         
            +
                "We'll use the word error rate (WER) metric, the 'de-facto' metric for assessing \n",
         
     | 
| 783 | 
         
            +
                "ASR systems. For more information, refer to the WER [docs](https://huggingface.co/metrics/wer). We'll load the WER metric from 🤗 Evaluate:"
         
     | 
| 784 | 
         
            +
               ]
         
     | 
| 785 | 
         
            +
              },
         
     | 
| 786 | 
         
            +
              {
         
     | 
| 787 | 
         
            +
               "cell_type": "code",
         
     | 
| 788 | 
         
            +
               "execution_count": 20,
         
     | 
| 789 | 
         
            +
               "id": "b22b4011-f31f-4b57-b684-c52332f92890",
         
     | 
| 790 | 
         
            +
               "metadata": {},
         
     | 
| 791 | 
         
            +
               "outputs": [],
         
     | 
| 792 | 
         
            +
               "source": [
         
     | 
| 793 | 
         
            +
                "import evaluate\n",
         
     | 
| 794 | 
         
            +
                "\n",
         
     | 
| 795 | 
         
            +
                "metric = evaluate.load(\"wer\")"
         
     | 
| 796 | 
         
            +
               ]
         
     | 
| 797 | 
         
            +
              },
         
     | 
| 798 | 
         
            +
              {
         
     | 
| 799 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 800 | 
         
            +
               "id": "509f96d7-3f11-4f37-add9-f74a0c44f3fc",
         
     | 
| 801 | 
         
            +
               "metadata": {},
         
     | 
| 802 | 
         
            +
               "source": [
         
     | 
| 803 | 
         
            +
                "We then simply have to define a function that takes our model \n",
         
     | 
| 804 | 
         
            +
                "predictions and returns the WER metric. This function, called\n",
         
     | 
| 805 | 
         
            +
                "`compute_metrics`, first replaces `-100` with the `pad_token_id`\n",
         
     | 
| 806 | 
         
            +
                "in the `label_ids` (undoing the step we applied in the \n",
         
     | 
| 807 | 
         
            +
                "data collator to ignore padded tokens correctly in the loss).\n",
         
     | 
| 808 | 
         
            +
                "It then decodes the predicted and label ids to strings. Finally,\n",
         
     | 
| 809 | 
         
            +
                "it computes the WER between the predictions and reference labels. \n",
         
     | 
| 810 | 
         
            +
                "Here, we have the option of evaluating with the 'normalised' transcriptions \n",
         
     | 
| 811 | 
         
            +
                "and predictions. We recommend you set this to `True` to benefit from the WER \n",
         
     | 
| 812 | 
         
            +
                "improvement obtained by normalising the transcriptions."
         
     | 
| 813 | 
         
            +
               ]
         
     | 
| 814 | 
         
            +
              },
         
     | 
| 815 | 
         
            +
              {
         
     | 
| 816 | 
         
            +
               "cell_type": "code",
         
     | 
| 817 | 
         
            +
               "execution_count": 21,
         
     | 
| 818 | 
         
            +
               "id": "a11d1bfc-9e28-460f-a287-72d8f7bc1acb",
         
     | 
| 819 | 
         
            +
               "metadata": {},
         
     | 
| 820 | 
         
            +
               "outputs": [],
         
     | 
| 821 | 
         
            +
               "source": [
         
     | 
| 822 | 
         
            +
                "# evaluate with the 'normalised' WER\n",
         
     | 
| 823 | 
         
            +
                "do_normalize_eval = True\n",
         
     | 
| 824 | 
         
            +
                "\n",
         
     | 
| 825 | 
         
            +
                "def compute_metrics(pred):\n",
         
     | 
| 826 | 
         
            +
                "    pred_ids = pred.predictions\n",
         
     | 
| 827 | 
         
            +
                "    label_ids = pred.label_ids\n",
         
     | 
| 828 | 
         
            +
                "\n",
         
     | 
| 829 | 
         
            +
                "    # replace -100 with the pad_token_id\n",
         
     | 
| 830 | 
         
            +
                "    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id\n",
         
     | 
| 831 | 
         
            +
                "\n",
         
     | 
| 832 | 
         
            +
                "    # we do not want to group tokens when computing the metrics\n",
         
     | 
| 833 | 
         
            +
                "    pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)\n",
         
     | 
| 834 | 
         
            +
                "    label_str = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True)\n",
         
     | 
| 835 | 
         
            +
                "\n",
         
     | 
| 836 | 
         
            +
                "    if do_normalize_eval:\n",
         
     | 
| 837 | 
         
            +
                "        pred_str = [normalizer(pred) for pred in pred_str]\n",
         
     | 
| 838 | 
         
            +
                "        label_str = [normalizer(label) for label in label_str]\n",
         
     | 
| 839 | 
         
            +
                "        # filtering step to only evaluate the samples that correspond to non-zero references:\n",
         
     | 
| 840 | 
         
            +
                "        pred_str = [pred_str[i] for i in range(len(pred_str)) if len(label_str[i]) > 0]\n",
         
     | 
| 841 | 
         
            +
                "        label_str = [label_str[i] for i in range(len(label_str)) if len(label_str[i]) > 0]\n",
         
     | 
| 842 | 
         
            +
                "\n",
         
     | 
| 843 | 
         
            +
                "    wer = 100 * metric.compute(predictions=pred_str, references=label_str)\n",
         
     | 
| 844 | 
         
            +
                "\n",
         
     | 
| 845 | 
         
            +
                "    return {\"wer\": wer}"
         
     | 
| 846 | 
         
            +
               ]
         
     | 
| 847 | 
         
            +
              },
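               {
                "cell_type": "markdown",
                "metadata": {},
                "source": [
                 "As a quick illustration of the metric itself (toy strings, not model outputs): one substituted word against a three-word reference gives a WER of 1/3:"
                ]
               },
               {
                "cell_type": "code",
                "execution_count": null,
                "metadata": {},
                "outputs": [],
                "source": [
                 "# Illustration (sketch): toy WER computation - one substitution over a\n",
                 "# three-word reference -> 1/3.\n",
                 "toy_wer = metric.compute(\n",
                 "    predictions=[\"hola mi mundo\"],\n",
                 "    references=[\"hola buen mundo\"],\n",
                 ")\n",
                 "print(toy_wer)  # 0.333..."
                ]
               },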
         
     | 
| 848 | 
         
            +
              {
         
     | 
| 849 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 850 | 
         
            +
               "id": "daf2a825-6d9f-4a23-b145-c37c0039075b",
         
     | 
| 851 | 
         
            +
               "metadata": {},
         
     | 
| 852 | 
         
            +
               "source": [
         
     | 
| 853 | 
         
            +
                "### Load a Pre-Trained Checkpoint"
         
     | 
| 854 | 
         
            +
               ]
         
     | 
| 855 | 
         
            +
              },
         
     | 
| 856 | 
         
            +
              {
         
     | 
| 857 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 858 | 
         
            +
               "id": "437a97fa-4864-476b-8abc-f28b8166cfa5",
         
     | 
| 859 | 
         
            +
               "metadata": {},
         
     | 
| 860 | 
         
            +
               "source": [
         
     | 
| 861 | 
         
            +
                "Now let's load the pre-trained Whisper `small` checkpoint. Again, this \n",
         
     | 
| 862 | 
         
            +
                "is trivial through use of 🤗 Transformers!"
         
     | 
| 863 | 
         
            +
               ]
         
     | 
| 864 | 
         
            +
              },
         
     | 
| 865 | 
         
            +
              {
         
     | 
| 866 | 
         
            +
               "cell_type": "code",
         
     | 
| 867 | 
         
            +
               "execution_count": 22,
         
     | 
| 868 | 
         
            +
               "id": "5a10cc4b-07ec-4ebd-ac1d-7c601023594f",
         
     | 
| 869 | 
         
            +
               "metadata": {},
         
     | 
| 870 | 
         
            +
               "outputs": [
         
     | 
| 871 | 
         
            +
                {
         
     | 
| 872 | 
         
            +
                 "data": {
         
     | 
| 873 | 
         
            +
                  "application/vnd.jupyter.widget-view+json": {
         
     | 
| 874 | 
         
            +
                   "model_id": "3b21514a2fff4878a2f569d2cc28b925",
         
     | 
| 875 | 
         
            +
                   "version_major": 2,
         
     | 
| 876 | 
         
            +
                   "version_minor": 0
         
     | 
| 877 | 
         
            +
                  },
         
     | 
| 878 | 
         
            +
                  "text/plain": [
         
     | 
| 879 | 
         
            +
                   "Downloading:   0%|          | 0.00/1.04k [00:00<?, ?B/s]"
         
     | 
| 880 | 
         
            +
                  ]
         
     | 
| 881 | 
         
            +
                 },
         
     | 
| 882 | 
         
            +
                 "metadata": {},
         
     | 
| 883 | 
         
            +
                 "output_type": "display_data"
         
     | 
| 884 | 
         
            +
                },
         
     | 
| 885 | 
         
            +
                {
         
     | 
| 886 | 
         
            +
                 "data": {
         
     | 
| 887 | 
         
            +
                  "application/vnd.jupyter.widget-view+json": {
         
     | 
| 888 | 
         
            +
                   "model_id": "28d70b74dbd844328ad9d325c9babfe1",
         
     | 
| 889 | 
         
            +
                   "version_major": 2,
         
     | 
| 890 | 
         
            +
                   "version_minor": 0
         
     | 
| 891 | 
         
            +
                  },
         
     | 
| 892 | 
         
            +
                  "text/plain": [
         
     | 
| 893 | 
         
            +
                   "Downloading:   0%|          | 0.00/3.06G [00:00<?, ?B/s]"
         
     | 
| 894 | 
         
            +
                  ]
         
     | 
| 895 | 
         
            +
                 },
         
     | 
| 896 | 
         
            +
                 "metadata": {},
         
     | 
| 897 | 
         
            +
                 "output_type": "display_data"
         
     | 
| 898 | 
         
            +
                }
         
     | 
| 899 | 
         
            +
               ],
         
     | 
| 900 | 
         
            +
               "source": [
         
     | 
| 901 | 
         
            +
                "from transformers import WhisperForConditionalGeneration\n",
         
     | 
| 902 | 
         
            +
                "\n",
         
     | 
| 903 | 
         
            +
                "model = WhisperForConditionalGeneration.from_pretrained(\"juancopi81/whisper-medium-es-common-fleurs\")"
         
     | 
| 904 | 
         
            +
               ]
         
     | 
| 905 | 
         
            +
              },
         
     | 
| 906 | 
         
            +
              {
         
     | 
| 907 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 908 | 
         
            +
               "id": "a15ead5f-2277-4a39-937b-585c2497b2df",
         
     | 
| 909 | 
         
            +
               "metadata": {},
         
     | 
| 910 | 
         
            +
               "source": [
         
     | 
| 911 | 
         
            +
                "Override generation arguments - no tokens are forced as decoder outputs (see [`forced_decoder_ids`](https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.generation_utils.GenerationMixin.generate.forced_decoder_ids)), no tokens are suppressed during generation (see [`suppress_tokens`](https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.generation_utils.GenerationMixin.generate.suppress_tokens)). Set `use_cache` to False since we're using gradient checkpointing, and the two are incompatible:"
         
     | 
| 912 | 
         
            +
               ]
         
     | 
| 913 | 
         
            +
              },
         
     | 
| 914 | 
         
            +
              {
         
     | 
| 915 | 
         
            +
               "cell_type": "code",
         
     | 
| 916 | 
         
            +
               "execution_count": 23,
         
     | 
| 917 | 
         
            +
               "id": "62038ba3-88ed-4fce-84db-338f50dcd04f",
         
     | 
| 918 | 
         
            +
               "metadata": {},
         
     | 
| 919 | 
         
            +
               "outputs": [],
         
     | 
| 920 | 
         
            +
               "source": [
         
     | 
| 921 | 
         
            +
                "model.config.forced_decoder_ids = None\n",
         
     | 
| 922 | 
         
            +
                "model.config.suppress_tokens = []\n",
         
     | 
| 923 | 
         
            +
                "model.config.use_cache = False"
         
     | 
| 924 | 
         
            +
               ]
         
     | 
| 925 | 
         
            +
              },
         
     | 
| 926 | 
         
            +
              {
         
     | 
| 927 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 928 | 
         
            +
               "id": "2178dea4-80ca-47b6-b6ea-ba1915c90c06",
         
     | 
| 929 | 
         
            +
               "metadata": {},
         
     | 
| 930 | 
         
            +
               "source": [
         
     | 
| 931 | 
         
            +
                "### Define the Training Configuration"
         
     | 
| 932 | 
         
            +
               ]
         
     | 
| 933 | 
         
            +
              },
         
     | 
| 934 | 
         
            +
              {
         
     | 
| 935 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 936 | 
         
            +
               "id": "c21af1e9-0188-4134-ac82-defc7bdcc436",
         
     | 
| 937 | 
         
            +
               "metadata": {},
         
     | 
| 938 | 
         
            +
               "source": [
         
     | 
| 939 | 
         
            +
                "In the final step, we define all the parameters related to training. Here, you can set the `max_steps` to train for longer. For more detail on the training arguments, refer to the Seq2SeqTrainingArguments [docs](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.Seq2SeqTrainingArguments)."
         
     | 
| 940 | 
         
            +
               ]
         
     | 
| 941 | 
         
            +
              },
         
     | 
| 942 | 
         
            +
              {
         
     | 
| 943 | 
         
            +
               "cell_type": "code",
         
     | 
| 944 | 
         
            +
               "execution_count": 24,
         
     | 
| 945 | 
         
            +
               "id": "0ae3e9af-97b7-4aa0-ae85-20b23b5bcb3a",
         
     | 
| 946 | 
         
            +
               "metadata": {},
         
     | 
| 947 | 
         
            +
               "outputs": [],
         
     | 
| 948 | 
         
            +
               "source": [
         
     | 
| 949 | 
         
            +
                "from transformers import Seq2SeqTrainingArguments\n",
         
     | 
| 950 | 
         
            +
                "\n",
         
     | 
| 951 | 
         
            +
                "training_args = Seq2SeqTrainingArguments(\n",
         
     | 
| 952 | 
         
            +
                "    output_dir=\"./\",\n",
         
     | 
| 953 | 
         
            +
                "    per_device_train_batch_size=32,\n",
         
     | 
| 954 | 
         
            +
                "    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size\n",
         
     | 
| 955 | 
         
            +
                "    learning_rate=1e-5,\n",
         
     | 
| 956 | 
         
            +
                "    warmup_steps=500,\n",
         
     | 
| 957 | 
         
            +
                "    max_steps=5000,\n",
         
     | 
| 958 | 
         
            +
                "    gradient_checkpointing=True,\n",
         
     | 
| 959 | 
         
            +
                "    fp16=True,\n",
         
     | 
| 960 | 
         
            +
                "    evaluation_strategy=\"steps\",\n",
         
     | 
| 961 | 
         
            +
                "    per_device_eval_batch_size=16,\n",
         
     | 
| 962 | 
         
            +
                "    predict_with_generate=True,\n",
         
     | 
| 963 | 
         
            +
                "    generation_max_length=225,\n",
         
     | 
| 964 | 
         
            +
                "    save_steps=1000,\n",
         
     | 
| 965 | 
         
            +
                "    eval_steps=1000,\n",
         
     | 
| 966 | 
         
            +
                "    logging_steps=25,\n",
         
     | 
| 967 | 
         
            +
                "    report_to=[\"tensorboard\"],\n",
         
     | 
| 968 | 
         
            +
                "    load_best_model_at_end=True,\n",
         
     | 
| 969 | 
         
            +
                "    metric_for_best_model=\"wer\",\n",
         
     | 
| 970 | 
         
            +
                "    greater_is_better=False,\n",
         
     | 
| 971 | 
         
            +
                "    push_to_hub=True,\n",
         
     | 
| 972 | 
         
            +
                ")"
         
     | 
| 973 | 
         
            +
               ]
         
     | 
| 974 | 
         
            +
              },
         
     | 
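If this configuration overflows GPU memory, the comment on `gradient_accumulation_steps` above gives the remedy: halve the batch size and double the accumulation steps so the effective batch size stays at 32. A minimal sketch with hypothetical halved values (not the settings used in this run):

    from transformers import Seq2SeqTrainingArguments

    # effective batch size = per_device_train_batch_size * gradient_accumulation_steps = 16 * 2 = 32
    training_args = Seq2SeqTrainingArguments(
        output_dir="./",
        per_device_train_batch_size=16,  # halved from 32 to reduce peak GPU memory
        gradient_accumulation_steps=2,   # doubled to keep the effective batch size unchanged
        learning_rate=1e-5,
        warmup_steps=500,
        max_steps=5000,
        gradient_checkpointing=True,
        fp16=True,
    )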
| 975 | 
         
            +
              {
         
     | 
| 976 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 977 | 
         
            +
               "id": "b3a944d8-3112-4552-82a0-be25988b3857",
         
     | 
| 978 | 
         
            +
               "metadata": {},
         
     | 
| 979 | 
         
            +
               "source": [
         
     | 
| 980 | 
         
            +
                "**Note**: if one does not want to upload the model checkpoints to the Hub, \n",
         
     | 
| 981 | 
         
            +
                "set `push_to_hub=False`."
         
     | 
| 982 | 
         
            +
               ]
         
     | 
| 983 | 
         
            +
              },
         
     | 
| 984 | 
         
            +
              {
         
     | 
| 985 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 986 | 
         
            +
               "id": "393c883e-3e50-492c-bd58-f51dbf15ee56",
         
     | 
| 987 | 
         
            +
               "metadata": {},
         
     | 
| 988 | 
         
            +
               "source": [
         
     | 
| 989 | 
         
            +
                "We then define a custom [Callback](https://huggingface.co/docs/transformers/main_classes/callback) that is called by the 🤗 Trainer on the end of each epoch. The Callback reinitialises and reshuffles the streaming dataset at the beginning of each new epoch - this gives different shuffling across our subsets for every epoch."
         
     | 
| 990 | 
         
            +
               ]
         
     | 
| 991 | 
         
            +
              },
         
     | 
| 992 | 
         
            +
              {
         
     | 
| 993 | 
         
            +
               "cell_type": "code",
         
     | 
| 994 | 
         
            +
               "execution_count": 25,
         
     | 
| 995 | 
         
            +
               "id": "3ac16b62-b3c0-4c68-8f3d-9ecf471534b2",
         
     | 
| 996 | 
         
            +
               "metadata": {},
         
     | 
| 997 | 
         
            +
               "outputs": [],
         
     | 
| 998 | 
         
            +
               "source": [
         
     | 
| 999 | 
         
            +
                "from transformers import TrainerCallback\n",
         
     | 
| 1000 | 
         
            +
                "from transformers.trainer_pt_utils import IterableDatasetShard\n",
         
     | 
| 1001 | 
         
            +
                "from torch.utils.data import IterableDataset\n",
         
     | 
| 1002 | 
         
            +
                "\n",
         
     | 
| 1003 | 
         
            +
                "# trainer callback to reinitialise and reshuffle the streamable datasets at the beginning of each epoch\n",
         
     | 
| 1004 | 
         
            +
                "class ShuffleCallback(TrainerCallback):\n",
         
     | 
| 1005 | 
         
            +
                "    def on_epoch_begin(self, args, state, control, train_dataloader, **kwargs):\n",
         
     | 
| 1006 | 
         
            +
                "        if isinstance(train_dataloader.dataset, IterableDatasetShard):\n",
         
     | 
| 1007 | 
         
            +
                "            pass  # set_epoch() is handled by the Trainer\n",
         
     | 
| 1008 | 
         
            +
                "        elif isinstance(train_dataloader.dataset, IterableDataset):\n",
         
     | 
| 1009 | 
         
            +
                "            train_dataloader.dataset.set_epoch(train_dataloader.dataset._epoch + 1)"
         
     | 
| 1010 | 
         
            +
               ]
         
     | 
| 1011 | 
         
            +
              },
         
     | 
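To see why bumping the epoch counter reshuffles a streaming dataset, note that 🤗 Datasets re-seeds the shuffle buffer with `seed + epoch`. A small sketch, not part of the original notebook, assuming the (gated, auth-required) Common Voice dataset used here is accessible:

    from datasets import load_dataset

    # streaming dataset with a seeded shuffle buffer
    ds = load_dataset("mozilla-foundation/common_voice_11_0", "es", split="train", streaming=True)
    ds = ds.shuffle(seed=0, buffer_size=500)

    ds.set_epoch(0)
    first = [ex["sentence"] for _, ex in zip(range(5), ds)]

    ds.set_epoch(1)  # effective shuffling seed becomes seed + epoch
    second = [ex["sentence"] for _, ex in zip(range(5), ds)]

    print(first != second)  # almost certainly True: a different sample order each epoch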
| 1012 | 
         
            +
              {
         
     | 
| 1013 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 1014 | 
         
            +
               "id": "bac29114-d226-4f54-97cf-8718c9f94e1e",
         
     | 
| 1015 | 
         
            +
               "metadata": {},
         
     | 
| 1016 | 
         
            +
               "source": [
         
     | 
| 1017 | 
         
            +
                "We can forward the training arguments to the 🤗 Trainer along with our model,\n",
         
     | 
| 1018 | 
         
            +
                "dataset, data collator, `compute_metrics` function and custom callback:"
         
     | 
| 1019 | 
         
            +
               ]
         
     | 
| 1020 | 
         
            +
              },
         
     | 
| 1021 | 
         
            +
              {
         
     | 
| 1022 | 
         
            +
               "cell_type": "code",
         
     | 
| 1023 | 
         
            +
               "execution_count": 26,
         
     | 
| 1024 | 
         
            +
               "id": "d546d7fe-0543-479a-b708-2ebabec19493",
         
     | 
| 1025 | 
         
            +
               "metadata": {},
         
     | 
| 1026 | 
         
            +
               "outputs": [
         
     | 
| 1027 | 
         
            +
                {
         
     | 
| 1028 | 
         
            +
                 "name": "stderr",
         
     | 
| 1029 | 
         
            +
                 "output_type": "stream",
         
     | 
| 1030 | 
         
            +
                 "text": [
         
     | 
| 1031 | 
         
            +
                  "/home/ubuntu/whisper-medium-es-common-fleurs-5k-10k/./ is already a clone of https://huggingface.co/juancopi81/whisper-medium-es-common-fleurs-5k-10k. Make sure you pull the latest changes with `repo.git_pull()`.\n",
         
     | 
| 1032 | 
         
            +
                  "max_steps is given, it will override any value given in num_train_epochs\n",
         
     | 
| 1033 | 
         
            +
                  "Using cuda_amp half precision backend\n"
         
     | 
| 1034 | 
         
            +
                 ]
         
     | 
| 1035 | 
         
            +
                }
         
     | 
| 1036 | 
         
            +
               ],
         
     | 
| 1037 | 
         
            +
               "source": [
         
     | 
| 1038 | 
         
            +
                "from transformers import Seq2SeqTrainer\n",
         
     | 
| 1039 | 
         
            +
                "\n",
         
     | 
| 1040 | 
         
            +
                "trainer = Seq2SeqTrainer(\n",
         
     | 
| 1041 | 
         
            +
                "    args=training_args,\n",
         
     | 
| 1042 | 
         
            +
                "    model=model,\n",
         
     | 
| 1043 | 
         
            +
                "    train_dataset=vectorized_datasets[\"train\"],\n",
         
     | 
| 1044 | 
         
            +
                "    eval_dataset=vectorized_datasets[\"test\"],\n",
         
     | 
| 1045 | 
         
            +
                "    data_collator=data_collator,\n",
         
     | 
| 1046 | 
         
            +
                "    compute_metrics=compute_metrics,\n",
         
     | 
| 1047 | 
         
            +
                "    tokenizer=processor,\n",
         
     | 
| 1048 | 
         
            +
                "    callbacks=[ShuffleCallback()],\n",
         
     | 
| 1049 | 
         
            +
                ")"
         
     | 
| 1050 | 
         
            +
               ]
         
     | 
| 1051 | 
         
            +
              },
         
     | 
| 1052 | 
         
            +
              {
         
     | 
| 1053 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 1054 | 
         
            +
               "id": "67ab88c3-7091-4e51-8ad5-f5cacbe18449",
         
     | 
| 1055 | 
         
            +
               "metadata": {},
         
     | 
| 1056 | 
         
            +
               "source": [
         
     | 
| 1057 | 
         
            +
                "We'll save the model and processor to the output directory before training:"
         
     | 
| 1058 | 
         
            +
               ]
         
     | 
| 1059 | 
         
            +
              },
         
     | 
| 1060 | 
         
            +
              {
         
     | 
| 1061 | 
         
            +
               "cell_type": "code",
         
     | 
| 1062 | 
         
            +
               "execution_count": 27,
         
     | 
| 1063 | 
         
            +
               "id": "a1ccb9ed-cbc8-4419-91c0-651e9424b672",
         
     | 
| 1064 | 
         
            +
               "metadata": {},
         
     | 
| 1065 | 
         
            +
               "outputs": [
         
     | 
| 1066 | 
         
            +
                {
         
     | 
| 1067 | 
         
            +
                 "name": "stderr",
         
     | 
| 1068 | 
         
            +
                 "output_type": "stream",
         
     | 
| 1069 | 
         
            +
                 "text": [
         
     | 
| 1070 | 
         
            +
                  "Configuration saved in ./config.json\n",
         
     | 
| 1071 | 
         
            +
                  "Model weights saved in ./pytorch_model.bin\n",
         
     | 
| 1072 | 
         
            +
                  "Feature extractor saved in ./preprocessor_config.json\n",
         
     | 
| 1073 | 
         
            +
                  "tokenizer config file saved in ./tokenizer_config.json\n",
         
     | 
| 1074 | 
         
            +
                  "Special tokens file saved in ./special_tokens_map.json\n",
         
     | 
| 1075 | 
         
            +
                  "added tokens file saved in ./added_tokens.json\n"
         
     | 
| 1076 | 
         
            +
                 ]
         
     | 
| 1077 | 
         
            +
                }
         
     | 
| 1078 | 
         
            +
               ],
         
     | 
| 1079 | 
         
            +
               "source": [
         
     | 
| 1080 | 
         
            +
                "model.save_pretrained(training_args.output_dir)\n",
         
     | 
| 1081 | 
         
            +
                "processor.save_pretrained(training_args.output_dir)"
         
     | 
| 1082 | 
         
            +
               ]
         
     | 
| 1083 | 
         
            +
              },
         
     | 
| 1084 | 
         
            +
              {
         
     | 
| 1085 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 1086 | 
         
            +
               "id": "7f404cf9-4345-468c-8196-4bd101d9bd51",
         
     | 
| 1087 | 
         
            +
               "metadata": {},
         
     | 
| 1088 | 
         
            +
               "source": [
         
     | 
| 1089 | 
         
            +
                "### Training"
         
     | 
| 1090 | 
         
            +
               ]
         
     | 
| 1091 | 
         
            +
              },
         
     | 
| 1092 | 
         
            +
              {
         
     | 
| 1093 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 1094 | 
         
            +
               "id": "5e8b8d56-5a70-4f68-bd2e-f0752d0bd112",
         
     | 
| 1095 | 
         
            +
               "metadata": {},
         
     | 
| 1096 | 
         
            +
               "source": [
         
     | 
| 1097 | 
         
            +
                "Training will take approximately 5-10 hours depending on your GPU. The peak GPU memory for the given training configuration is approximately 36GB. \n",
         
     | 
| 1098 | 
         
            +
                "Depending on your GPU, it is possible that you will encounter a CUDA `\"out-of-memory\"` error when you launch training. \n",
         
     | 
| 1099 | 
         
            +
                "In this case, you can reduce the `per_device_train_batch_size` incrementally by factors of 2 \n",
         
     | 
| 1100 | 
         
            +
                "and employ [`gradient_accumulation_steps`](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.Seq2SeqTrainingArguments.gradient_accumulation_steps)\n",
         
     | 
| 1101 | 
         
            +
                "to compensate.\n",
         
     | 
| 1102 | 
         
            +
                "\n",
         
     | 
| 1103 | 
         
            +
                "To launch training, simply execute:"
         
     | 
| 1104 | 
         
            +
               ]
         
     | 
| 1105 | 
         
            +
              },
         
     | 
| 1106 | 
         
            +
              {
         
     | 
| 1107 | 
         
            +
               "cell_type": "code",
         
     | 
| 1108 | 
         
            +
               "execution_count": 28,
         
     | 
| 1109 | 
         
            +
               "id": "ced90915-84df-4538-9034-f6c8c85de2df",
         
     | 
| 1110 | 
         
            +
               "metadata": {},
         
     | 
| 1111 | 
         
            +
               "outputs": [
         
     | 
| 1112 | 
         
            +
                {
         
     | 
| 1113 | 
         
            +
                 "data": {
         
     | 
| 1114 | 
         
            +
                  "application/vnd.jupyter.widget-view+json": {
         
     | 
| 1115 | 
         
            +
                   "model_id": "386d02833fb0467980c51f82505ce44a",
         
     | 
| 1116 | 
         
            +
                   "version_major": 2,
         
     | 
| 1117 | 
         
            +
                   "version_minor": 0
         
     | 
| 1118 | 
         
            +
                  },
         
     | 
| 1119 | 
         
            +
                  "text/plain": [
         
     | 
| 1120 | 
         
            +
                   "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
         
     | 
| 1121 | 
         
            +
                  ]
         
     | 
| 1122 | 
         
            +
                 },
         
     | 
| 1123 | 
         
            +
                 "metadata": {},
         
     | 
| 1124 | 
         
            +
                 "output_type": "display_data"
         
     | 
| 1125 | 
         
            +
                }
         
     | 
| 1126 | 
         
            +
               ],
         
     | 
| 1127 | 
         
            +
               "source": [
         
     | 
| 1128 | 
         
            +
                "from huggingface_hub import notebook_login\n",
         
     | 
| 1129 | 
         
            +
                "\n",
         
     | 
| 1130 | 
         
            +
                "notebook_login()"
         
     | 
| 1131 | 
         
            +
               ]
         
     | 
| 1132 | 
         
            +
              },
         
     | 
| 1133 | 
         
            +
              {
         
     | 
| 1134 | 
         
            +
               "cell_type": "code",
         
     | 
| 1135 | 
         
            +
               "execution_count": null,
         
     | 
| 1136 | 
         
            +
               "id": "ee8b7b8e-1c9a-4d77-9137-1778a629e6de",
         
     | 
| 1137 | 
         
            +
               "metadata": {},
         
     | 
| 1138 | 
         
            +
               "outputs": [
         
     | 
| 1139 | 
         
            +
                {
         
     | 
| 1140 | 
         
            +
                 "name": "stderr",
         
     | 
| 1141 | 
         
            +
                 "output_type": "stream",
         
     | 
| 1142 | 
         
            +
                 "text": [
         
     | 
| 1143 | 
         
            +
                  "/home/ubuntu/hf_env/lib/python3.8/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
         
     | 
| 1144 | 
         
            +
                  "  warnings.warn(\n",
         
     | 
| 1145 | 
         
            +
                  "***** Running training *****\n",
         
     | 
| 1146 | 
         
            +
                  "  Num examples = 160000\n",
         
     | 
| 1147 | 
         
            +
                  "  Num Epochs = 9223372036854775807\n",
         
     | 
| 1148 | 
         
            +
                  "  Instantaneous batch size per device = 32\n",
         
     | 
| 1149 | 
         
            +
                  "  Total train batch size (w. parallel, distributed & accumulation) = 32\n",
         
     | 
| 1150 | 
         
            +
                  "  Gradient Accumulation steps = 1\n",
         
     | 
| 1151 | 
         
            +
                  "  Total optimization steps = 5000\n",
         
     | 
| 1152 | 
         
            +
                  "  Number of trainable parameters = 763857920\n",
         
     | 
| 1153 | 
         
            +
                  "Reading metadata...: 230467it [00:05, 39424.34it/s]\n",
         
     | 
| 1154 | 
         
            +
                  "The following columns in the training set don't have a corresponding argument in `WhisperForConditionalGeneration.forward` and have been ignored: input_length. If input_length are not expected by `WhisperForConditionalGeneration.forward`,  you can safely ignore this message.\n"
         
     | 
| 1155 | 
         
            +
                 ]
         
     | 
| 1156 | 
         
            +
                },
         
     | 
| 1157 | 
         
            +
                {
         
     | 
| 1158 | 
         
            +
                 "data": {
         
     | 
| 1159 | 
         
            +
                  "text/html": [
         
     | 
| 1160 | 
         
            +
                   "\n",
         
     | 
| 1161 | 
         
            +
                   "    <div>\n",
         
     | 
| 1162 | 
         
            +
                   "      \n",
         
     | 
| 1163 | 
         
            +
                   "      <progress value='1001' max='5000' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
         
     | 
| 1164 | 
         
            +
                   "      [1001/5000 1:45:40 < 7:03:01, 0.16 it/s, Epoch 0.20/9223372036854775807]\n",
         
     | 
| 1165 | 
         
            +
                   "    </div>\n",
         
     | 
| 1166 | 
         
            +
                   "    <table border=\"1\" class=\"dataframe\">\n",
         
     | 
| 1167 | 
         
            +
                   "  <thead>\n",
         
     | 
| 1168 | 
         
            +
                   " <tr style=\"text-align: left;\">\n",
         
     | 
| 1169 | 
         
            +
                   "      <th>Step</th>\n",
         
     | 
| 1170 | 
         
            +
                   "      <th>Training Loss</th>\n",
         
     | 
| 1171 | 
         
            +
                   "      <th>Validation Loss</th>\n",
         
     | 
| 1172 | 
         
            +
                   "    </tr>\n",
         
     | 
| 1173 | 
         
            +
                   "  </thead>\n",
         
     | 
| 1174 | 
         
            +
                   "  <tbody>\n",
         
     | 
| 1175 | 
         
            +
                   "  </tbody>\n",
         
     | 
| 1176 | 
         
            +
                   "</table><p>"
         
     | 
| 1177 | 
         
            +
                  ],
         
     | 
| 1178 | 
         
            +
                  "text/plain": [
         
     | 
| 1179 | 
         
            +
                   "<IPython.core.display.HTML object>"
         
     | 
| 1180 | 
         
            +
                  ]
         
     | 
| 1181 | 
         
            +
                 },
         
     | 
| 1182 | 
         
            +
                 "metadata": {},
         
     | 
| 1183 | 
         
            +
                 "output_type": "display_data"
         
     | 
| 1184 | 
         
            +
                },
         
     | 
| 1185 | 
         
            +
                {
         
     | 
| 1186 | 
         
            +
                 "name": "stderr",
         
     | 
| 1187 | 
         
            +
                 "output_type": "stream",
         
     | 
| 1188 | 
         
            +
                 "text": [
         
     | 
| 1189 | 
         
            +
                  "***** Running Evaluation *****\n",
         
     | 
| 1190 | 
         
            +
                  "  Num examples: Unknown\n",
         
     | 
| 1191 | 
         
            +
                  "  Batch size = 16\n",
         
     | 
| 1192 | 
         
            +
                  "Reading metadata...: 15520it [00:00, 83747.62it/s]\n",
         
     | 
| 1193 | 
         
            +
                  "The following columns in the evaluation set don't have a corresponding argument in `WhisperForConditionalGeneration.forward` and have been ignored: up_votes, client_id, down_votes, gender, accent, segment, path, locale, input_length, age. If up_votes, client_id, down_votes, gender, accent, segment, path, locale, input_length, age are not expected by `WhisperForConditionalGeneration.forward`,  you can safely ignore this message.\n"
         
     | 
| 1194 | 
         
            +
                 ]
         
     | 
| 1195 | 
         
            +
                }
         
     | 
| 1196 | 
         
            +
               ],
         
     | 
| 1197 | 
         
            +
               "source": [
         
     | 
| 1198 | 
         
            +
                "trainer.train()"
         
     | 
| 1199 | 
         
            +
               ]
         
     | 
| 1200 | 
         
            +
              },
         
     | 
| 1201 | 
         
            +
              {
         
     | 
| 1202 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 1203 | 
         
            +
               "id": "747c6a6e",
         
     | 
| 1204 | 
         
            +
               "metadata": {
         
     | 
| 1205 | 
         
            +
                "pycharm": {
         
     | 
| 1206 | 
         
            +
                 "name": "#%% md\n"
         
     | 
| 1207 | 
         
            +
                }
         
     | 
| 1208 | 
         
            +
               },
         
     | 
| 1209 | 
         
            +
               "source": [
         
     | 
| 1210 | 
         
            +
                "(note that training may take some time to commence as we load the first training data samples with streaming mode)"
         
     | 
| 1211 | 
         
            +
               ]
         
     | 
| 1212 | 
         
            +
              },
         
     | 
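Since checkpoints are saved every 1,000 steps (`save_steps=1000`), an interrupted run can be resumed rather than restarted. A sketch; the boolean form picks up the most recent `checkpoint-*` folder in `output_dir`:

    # resume from the latest checkpoint in output_dir
    trainer.train(resume_from_checkpoint=True)

    # or point at a specific checkpoint directory
    # trainer.train(resume_from_checkpoint="./checkpoint-1000")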
| 1213 | 
         
            +
              {
         
     | 
| 1214 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 1215 | 
         
            +
               "id": "810ced54-7187-4a06-b2fe-ba6dcca94dc3",
         
     | 
| 1216 | 
         
            +
               "metadata": {},
         
     | 
| 1217 | 
         
            +
               "source": [
         
     | 
| 1218 | 
         
            +
                "We can label our checkpoint with the `whisper-event` tag on push by setting the appropriate key-word arguments (kwargs):"
         
     | 
| 1219 | 
         
            +
               ]
         
     | 
| 1220 | 
         
            +
              },
         
     | 
| 1221 | 
         
            +
              {
         
     | 
| 1222 | 
         
            +
               "cell_type": "code",
         
     | 
| 1223 | 
         
            +
               "execution_count": null,
         
     | 
| 1224 | 
         
            +
               "id": "6dd0e310-9b07-4133-ac14-2ed2d7524e22",
         
     | 
| 1225 | 
         
            +
               "metadata": {},
         
     | 
| 1226 | 
         
            +
               "outputs": [],
         
     | 
| 1227 | 
         
            +
               "source": [
         
     | 
| 1228 | 
         
            +
                "kwargs = {\n",
         
     | 
| 1229 | 
         
            +
                "    \"dataset_tags\": \"mozilla-foundation/common_voice_11_0\",\n",
         
     | 
| 1230 | 
         
            +
                "    \"dataset\": \"Common Voice 11.0\",  # a 'pretty' name for the training dataset\n",
         
     | 
| 1231 | 
         
            +
                "    \"language\": \"es\",\n",
         
     | 
| 1232 | 
         
            +
                "    \"model_name\": \"Whisper Mediuem Es - Juan Pineros\",  # a 'pretty' name for your model\n",
         
     | 
| 1233 | 
         
            +
                "    \"finetuned_from\": \"openai/whisper-small\",\n",
         
     | 
| 1234 | 
         
            +
                "    \"tasks\": \"automatic-speech-recognition\",\n",
         
     | 
| 1235 | 
         
            +
                "    \"tags\": \"whisper-event\",\n",
         
     | 
| 1236 | 
         
            +
                "}"
         
     | 
| 1237 | 
         
            +
               ]
         
     | 
| 1238 | 
         
            +
              },
         
     | 
| 1239 | 
         
            +
              {
         
     | 
| 1240 | 
         
            +
               "cell_type": "markdown",
         
     | 
| 1241 | 
         
            +
               "id": "090d676a-f944-4297-a938-a40eda0b2b68",
         
     | 
| 1242 | 
         
            +
               "metadata": {},
         
     | 
| 1243 | 
         
            +
               "source": [
         
     | 
| 1244 | 
         
            +
                "The training results can now be uploaded to the Hub. To do so, execute the `push_to_hub` command:"
         
     | 
| 1245 | 
         
            +
               ]
         
     | 
| 1246 | 
         
            +
              },
         
     | 
| 1247 | 
         
            +
              {
         
     | 
| 1248 | 
         
            +
               "cell_type": "code",
         
     | 
| 1249 | 
         
            +
               "execution_count": null,
         
     | 
| 1250 | 
         
            +
               "id": "95737cda-c5dd-4887-a4d0-dfcb0d61d977",
         
     | 
| 1251 | 
         
            +
               "metadata": {},
         
     | 
| 1252 | 
         
            +
               "outputs": [],
         
     | 
| 1253 | 
         
            +
               "source": [
         
     | 
| 1254 | 
         
            +
                "trainer.push_to_hub(**kwargs)"
         
     | 
| 1255 | 
         
            +
               ]
         
     | 
| 1256 | 
         
            +
              },
         
     | 
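Once pushed, the fine-tuned checkpoint can be loaded straight from the Hub for inference. A minimal sketch using the ASR pipeline; the repo id is taken from the clone message in the training logs, and the audio file name is hypothetical:

    from transformers import pipeline

    asr = pipeline(
        "automatic-speech-recognition",
        model="juancopi81/whisper-medium-es-common-fleurs-5k-10k",
    )

    # transcribe a local audio file; chunking handles clips longer than 30 s
    print(asr("sample_es.wav", chunk_length_s=30)["text"])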
| 1257 | 
         
            +
              {
         
     | 
| 1258 | 
         
            +
               "cell_type": "code",
         
     | 
| 1259 | 
         
            +
               "execution_count": null,
         
     | 
| 1260 | 
         
            +
               "id": "29e716f8-7386-4c8f-a35a-4f682ec24eb0",
         
     | 
| 1261 | 
         
            +
               "metadata": {},
         
     | 
| 1262 | 
         
            +
               "outputs": [],
         
     | 
| 1263 | 
         
            +
               "source": []
         
     | 
| 1264 | 
         
            +
              }
         
     | 
| 1265 | 
         
            +
             ],
         
     | 
| 1266 | 
         
            +
             "metadata": {
         
     | 
| 1267 | 
         
            +
              "kernelspec": {
         
     | 
| 1268 | 
         
            +
               "display_name": "hf_env",
         
     | 
| 1269 | 
         
            +
               "language": "python",
         
     | 
| 1270 | 
         
            +
               "name": "hf_env"
         
     | 
| 1271 | 
         
            +
              },
         
     | 
| 1272 | 
         
            +
              "language_info": {
         
     | 
| 1273 | 
         
            +
               "codemirror_mode": {
         
     | 
| 1274 | 
         
            +
                "name": "ipython",
         
     | 
| 1275 | 
         
            +
                "version": 3
         
     | 
| 1276 | 
         
            +
               },
         
     | 
| 1277 | 
         
            +
               "file_extension": ".py",
         
     | 
| 1278 | 
         
            +
               "mimetype": "text/x-python",
         
     | 
| 1279 | 
         
            +
               "name": "python",
         
     | 
| 1280 | 
         
            +
               "nbconvert_exporter": "python",
         
     | 
| 1281 | 
         
            +
               "pygments_lexer": "ipython3",
         
     | 
| 1282 | 
         
            +
               "version": "3.8.10"
         
     | 
| 1283 | 
         
            +
              }
         
     | 
| 1284 | 
         
            +
             },
         
     | 
| 1285 | 
         
            +
             "nbformat": 4,
         
     | 
| 1286 | 
         
            +
             "nbformat_minor": 5
         
     | 
| 1287 | 
         
            +
            }
         
     | 
    	
        merges.txt
    ADDED
    
    | 
         The diff for this file is too large to render. 
		See raw diff 
     | 
| 
         | 
    	
        normalizer.json
    ADDED
    
    | 
         @@ -0,0 +1,1742 @@ 
     | 
         (1,742 added lines; contents not rendered)
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
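What follows is the diff of the added normalizer.json: a British-to-American English spelling mapping, typically applied to both reference and predicted transcripts before computing WER so that spelling variants such as "colour"/"color" are not scored as errors. As a minimal sketch of how such a mapping is consumed (a plain-Python illustration, not the Transformers implementation; it assumes normalizer.json is present in the working directory):

import json

# Load the British->American spelling mapping shipped with the model repo.
with open("normalizer.json") as f:
    spelling_mapping = json.load(f)

def normalize_spelling(text: str) -> str:
    # Replace each whitespace-separated word by its American spelling,
    # leaving unmapped words unchanged.
    return " ".join(spelling_mapping.get(word, word) for word in text.split())

print(normalize_spelling("we apologise for the colour of the aluminium centre"))
# -> we apologize for the color of the aluminum center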
+{
+  "accessorise": "accessorize",
+  "accessorised": "accessorized",
+  "accessorises": "accessorizes",
+  "accessorising": "accessorizing",
+  "acclimatisation": "acclimatization",
+  "acclimatise": "acclimatize",
+  "acclimatised": "acclimatized",
+  "acclimatises": "acclimatizes",
+  "acclimatising": "acclimatizing",
+  "accoutrements": "accouterments",
+  "aeon": "eon",
+  "aeons": "eons",
+  "aerogramme": "aerogram",
+  "aerogrammes": "aerograms",
+  "aeroplane": "airplane",
+  "aeroplanes": "airplanes",
+  "aesthete": "esthete",
+  "aesthetes": "esthetes",
+  "aesthetic": "esthetic",
+  "aesthetically": "esthetically",
+  "aesthetics": "esthetics",
+  "aetiology": "etiology",
+  "ageing": "aging",
+  "aggrandisement": "aggrandizement",
+  "agonise": "agonize",
+  "agonised": "agonized",
+  "agonises": "agonizes",
+  "agonising": "agonizing",
+  "agonisingly": "agonizingly",
+  "almanack": "almanac",
+  "almanacks": "almanacs",
+  "aluminium": "aluminum",
+  "amortisable": "amortizable",
+  "amortisation": "amortization",
+  "amortisations": "amortizations",
+  "amortise": "amortize",
+  "amortised": "amortized",
+  "amortises": "amortizes",
+  "amortising": "amortizing",
+  "amphitheatre": "amphitheater",
+  "amphitheatres": "amphitheaters",
+  "anaemia": "anemia",
+  "anaemic": "anemic",
+  "anaesthesia": "anesthesia",
+  "anaesthetic": "anesthetic",
+  "anaesthetics": "anesthetics",
+  "anaesthetise": "anesthetize",
+  "anaesthetised": "anesthetized",
+  "anaesthetises": "anesthetizes",
+  "anaesthetising": "anesthetizing",
+  "anaesthetist": "anesthetist",
+  "anaesthetists": "anesthetists",
+  "anaesthetize": "anesthetize",
+  "anaesthetized": "anesthetized",
+  "anaesthetizes": "anesthetizes",
+  "anaesthetizing": "anesthetizing",
+  "analogue": "analog",
+  "analogues": "analogs",
+  "analyse": "analyze",
+  "analysed": "analyzed",
+  "analyses": "analyzes",
+  "analysing": "analyzing",
+  "anglicise": "anglicize",
+  "anglicised": "anglicized",
+  "anglicises": "anglicizes",
+  "anglicising": "anglicizing",
+  "annualised": "annualized",
+  "antagonise": "antagonize",
+  "antagonised": "antagonized",
+  "antagonises": "antagonizes",
+  "antagonising": "antagonizing",
+  "apologise": "apologize",
+  "apologised": "apologized",
+  "apologises": "apologizes",
+  "apologising": "apologizing",
+  "appal": "appall",
+  "appals": "appalls",
+  "appetiser": "appetizer",
+  "appetisers": "appetizers",
+  "appetising": "appetizing",
+  "appetisingly": "appetizingly",
+  "arbour": "arbor",
+  "arbours": "arbors",
+  "archaeologically": "archeologically",
+  "archaeologist": "archeologist",
+  "archaeologists": "archeologists",
+  "archaeology": "archeology",
+  "archeological": "archaeological",
+  "ardour": "ardor",
+  "armour": "armor",
+  "armoured": "armored",
+  "armourer": "armorer",
+  "armourers": "armorers",
+  "armouries": "armories",
+  "armoury": "armory",
+  "artefact": "artifact",
+  "artefacts": "artifacts",
+  "authorise": "authorize",
+  "authorised": "authorized",
+  "authorises": "authorizes",
+  "authorising": "authorizing",
+  "axe": "ax",
+  "backpedalled": "backpedaled",
+  "backpedalling": "backpedaling",
+  "bannister": "banister",
+  "bannisters": "banisters",
+  "baptise": "baptize",
+  "baptised": "baptized",
+  "baptises": "baptizes",
+  "baptising": "baptizing",
+  "bastardise": "bastardize",
+  "bastardised": "bastardized",
+  "bastardises": "bastardizes",
+  "bastardising": "bastardizing",
+  "battleax": "battleaxe",
+  "baulk": "balk",
+  "baulked": "balked",
+  "baulking": "balking",
+  "baulks": "balks",
+  "bedevilled": "bedeviled",
+  "bedevilling": "bedeviling",
+  "behaviour": "behavior",
+  "behavioural": "behavioral",
+  "behaviourism": "behaviorism",
+  "behaviourist": "behaviorist",
+  "behaviourists": "behaviorists",
+  "behaviours": "behaviors",
+  "behove": "behoove",
+  "behoved": "behooved",
+  "behoves": "behooves",
+  "bejewelled": "bejeweled",
+  "belabour": "belabor",
+  "belaboured": "belabored",
+  "belabouring": "belaboring",
+  "belabours": "belabors",
+  "bevelled": "beveled",
+  "bevvies": "bevies",
+  "bevvy": "bevy",
+  "biassed": "biased",
+  "biassing": "biasing",
+  "bingeing": "binging",
+  "bougainvillaea": "bougainvillea",
+  "bougainvillaeas": "bougainvilleas",
+  "bowdlerise": "bowdlerize",
+  "bowdlerised": "bowdlerized",
+  "bowdlerises": "bowdlerizes",
+  "bowdlerising": "bowdlerizing",
+  "breathalyse": "breathalyze",
+  "breathalysed": "breathalyzed",
+  "breathalyser": "breathalyzer",
+  "breathalysers": "breathalyzers",
+  "breathalyses": "breathalyzes",
+  "breathalysing": "breathalyzing",
+  "brutalise": "brutalize",
+  "brutalised": "brutalized",
+  "brutalises": "brutalizes",
+  "brutalising": "brutalizing",
+  "busses": "buses",
+  "bussing": "busing",
+  "caesarean": "cesarean",
+  "caesareans": "cesareans",
+  "calibre": "caliber",
+  "calibres": "calibers",
+  "calliper": "caliper",
+  "callipers": "calipers",
+  "callisthenics": "calisthenics",
+  "canalise": "canalize",
+  "canalised": "canalized",
+  "canalises": "canalizes",
+  "canalising": "canalizing",
+  "cancelation": "cancellation",
+  "cancelations": "cancellations",
+  "cancelled": "canceled",
+  "cancelling": "canceling",
+  "candour": "candor",
+  "cannibalise": "cannibalize",
+  "cannibalised": "cannibalized",
+  "cannibalises": "cannibalizes",
+  "cannibalising": "cannibalizing",
+  "canonise": "canonize",
+  "canonised": "canonized",
+  "canonises": "canonizes",
+  "canonising": "canonizing",
+  "capitalise": "capitalize",
+  "capitalised": "capitalized",
+  "capitalises": "capitalizes",
+  "capitalising": "capitalizing",
+  "caramelise": "caramelize",
+  "caramelised": "caramelized",
+  "caramelises": "caramelizes",
+  "caramelising": "caramelizing",
+  "carbonise": "carbonize",
+  "carbonised": "carbonized",
+  "carbonises": "carbonizes",
+  "carbonising": "carbonizing",
+  "carolled": "caroled",
+  "carolling": "caroling",
+  "catalogue": "catalog",
+  "catalogued": "cataloged",
+  "catalogues": "catalogs",
+  "cataloguing": "cataloging",
+  "catalyse": "catalyze",
+  "catalysed": "catalyzed",
+  "catalyses": "catalyzes",
+  "catalysing": "catalyzing",
+  "categorise": "categorize",
+  "categorised": "categorized",
+  "categorises": "categorizes",
+  "categorising": "categorizing",
+  "cauterise": "cauterize",
+  "cauterised": "cauterized",
+  "cauterises": "cauterizes",
+  "cauterising": "cauterizing",
+  "cavilled": "caviled",
+  "cavilling": "caviling",
+  "centigramme": "centigram",
+  "centigrammes": "centigrams",
+  "centilitre": "centiliter",
+  "centilitres": "centiliters",
+  "centimetre": "centimeter",
+  "centimetres": "centimeters",
+  "centralise": "centralize",
+  "centralised": "centralized",
+  "centralises": "centralizes",
+  "centralising": "centralizing",
+  "centre": "center",
+  "centred": "centered",
+  "centrefold": "centerfold",
+  "centrefolds": "centerfolds",
+  "centrepiece": "centerpiece",
+  "centrepieces": "centerpieces",
+  "centres": "centers",
+  "channelled": "channeled",
+  "channelling": "channeling",
+  "characterise": "characterize",
+  "characterised": "characterized",
+  "characterises": "characterizes",
+  "characterising": "characterizing",
+  "cheque": "check",
+  "chequebook": "checkbook",
+  "chequebooks": "checkbooks",
+  "chequered": "checkered",
+  "cheques": "checks",
+  "chilli": "chili",
+  "chimaera": "chimera",
+  "chimaeras": "chimeras",
+  "chiselled": "chiseled",
+  "chiselling": "chiseling",
+  "circularise": "circularize",
+  "circularised": "circularized",
+  "circularises": "circularizes",
+  "circularising": "circularizing",
+  "civilise": "civilize",
+  "civilised": "civilized",
+  "civilises": "civilizes",
+  "civilising": "civilizing",
+  "clamour": "clamor",
+  "clamoured": "clamored",
+  "clamouring": "clamoring",
+  "clamours": "clamors",
+  "clangour": "clangor",
+  "clarinettist": "clarinetist",
+  "clarinettists": "clarinetists",
+  "collectivise": "collectivize",
+  "collectivised": "collectivized",
+  "collectivises": "collectivizes",
+  "collectivising": "collectivizing",
+  "colonisation": "colonization",
+  "colonise": "colonize",
+  "colonised": "colonized",
+  "coloniser": "colonizer",
+  "colonisers": "colonizers",
+  "colonises": "colonizes",
+  "colonising": "colonizing",
+  "colour": "color",
+  "colourant": "colorant",
+  "colourants": "colorants",
+  "coloured": "colored",
+  "coloureds": "coloreds",
+  "colourful": "colorful",
+  "colourfully": "colorfully",
+  "colouring": "coloring",
+  "colourize": "colorize",
+  "colourized": "colorized",
+  "colourizes": "colorizes",
+  "colourizing": "colorizing",
+  "colourless": "colorless",
+  "colours": "colors",
+  "commercialise": "commercialize",
+  "commercialised": "commercialized",
+  "commercialises": "commercializes",
+  "commercialising": "commercializing",
+  "compartmentalise": "compartmentalize",
+  "compartmentalised": "compartmentalized",
+  "compartmentalises": "compartmentalizes",
+  "compartmentalising": "compartmentalizing",
+  "computerise": "computerize",
+  "computerised": "computerized",
+  "computerises": "computerizes",
+  "computerising": "computerizing",
+  "conceptualise": "conceptualize",
+  "conceptualised": "conceptualized",
+  "conceptualises": "conceptualizes",
+  "conceptualising": "conceptualizing",
+  "connexion": "connection",
+  "connexions": "connections",
+  "contextualise": "contextualize",
+  "contextualised": "contextualized",
+  "contextualises": "contextualizes",
+  "contextualising": "contextualizing",
+  "cosier": "cozier",
+  "cosies": "cozies",
+  "cosiest": "coziest",
+  "cosily": "cozily",
+  "cosiness": "coziness",
+  "cosy": "cozy",
+  "councillor": "councilor",
+  "councillors": "councilors",
+  "counselled": "counseled",
+  "counselling": "counseling",
+  "counsellor": "counselor",
+  "counsellors": "counselors",
+  "crenelated": "crenellated",
+  "criminalise": "criminalize",
+  "criminalised": "criminalized",
+  "criminalises": "criminalizes",
+  "criminalising": "criminalizing",
+  "criticise": "criticize",
+  "criticised": "criticized",
+  "criticises": "criticizes",
+  "criticising": "criticizing",
+  "crueller": "crueler",
+  "cruellest": "cruelest",
+  "crystallisation": "crystallization",
+  "crystallise": "crystallize",
+  "crystallised": "crystallized",
+  "crystallises": "crystallizes",
+  "crystallising": "crystallizing",
+  "cudgelled": "cudgeled",
+  "cudgelling": "cudgeling",
+  "customise": "customize",
+  "customised": "customized",
+  "customises": "customizes",
+  "customising": "customizing",
+  "cypher": "cipher",
+  "cyphers": "ciphers",
+  "decentralisation": "decentralization",
+  "decentralise": "decentralize",
+  "decentralised": "decentralized",
+  "decentralises": "decentralizes",
+  "decentralising": "decentralizing",
+  "decriminalisation": "decriminalization",
+  "decriminalise": "decriminalize",
+  "decriminalised": "decriminalized",
+  "decriminalises": "decriminalizes",
+  "decriminalising": "decriminalizing",
+  "defence": "defense",
+  "defenceless": "defenseless",
+  "defences": "defenses",
+  "dehumanisation": "dehumanization",
+  "dehumanise": "dehumanize",
+  "dehumanised": "dehumanized",
+  "dehumanises": "dehumanizes",
+  "dehumanising": "dehumanizing",
+  "demeanour": "demeanor",
+  "demilitarisation": "demilitarization",
+  "demilitarise": "demilitarize",
+  "demilitarised": "demilitarized",
+  "demilitarises": "demilitarizes",
+  "demilitarising": "demilitarizing",
+  "demobilisation": "demobilization",
+  "demobilise": "demobilize",
+  "demobilised": "demobilized",
+  "demobilises": "demobilizes",
+  "demobilising": "demobilizing",
+  "democratisation": "democratization",
+  "democratise": "democratize",
+  "democratised": "democratized",
+  "democratises": "democratizes",
+  "democratising": "democratizing",
+  "demonise": "demonize",
+  "demonised": "demonized",
+  "demonises": "demonizes",
+  "demonising": "demonizing",
+  "demoralisation": "demoralization",
+  "demoralise": "demoralize",
+  "demoralised": "demoralized",
+  "demoralises": "demoralizes",
+  "demoralising": "demoralizing",
+  "denationalisation": "denationalization",
+  "denationalise": "denationalize",
+  "denationalised": "denationalized",
+  "denationalises": "denationalizes",
+  "denationalising": "denationalizing",
+  "deodorise": "deodorize",
+  "deodorised": "deodorized",
+  "deodorises": "deodorizes",
+  "deodorising": "deodorizing",
+  "depersonalise": "depersonalize",
+  "depersonalised": "depersonalized",
+  "depersonalises": "depersonalizes",
+  "depersonalising": "depersonalizing",
+  "deputise": "deputize",
+  "deputised": "deputized",
+  "deputises": "deputizes",
+  "deputising": "deputizing",
+  "desensitisation": "desensitization",
+  "desensitise": "desensitize",
+  "desensitised": "desensitized",
+  "desensitises": "desensitizes",
+  "desensitising": "desensitizing",
+  "destabilisation": "destabilization",
+  "destabilise": "destabilize",
+  "destabilised": "destabilized",
+  "destabilises": "destabilizes",
+  "destabilising": "destabilizing",
+  "dialled": "dialed",
+  "dialling": "dialing",
+  "dialogue": "dialog",
+  "dialogues": "dialogs",
+  "diarrhoea": "diarrhea",
+  "digitise": "digitize",
+  "digitised": "digitized",
+  "digitises": "digitizes",
+  "digitising": "digitizing",
+  "disc": "disk",
+  "discolour": "discolor",
+  "discoloured": "discolored",
+  "discolouring": "discoloring",
+  "discolours": "discolors",
+  "discs": "disks",
+  "disembowelled": "disemboweled",
+  "disembowelling": "disemboweling",
+  "disfavour": "disfavor",
+  "dishevelled": "disheveled",
+  "dishonour": "dishonor",
+  "dishonourable": "dishonorable",
+  "dishonourably": "dishonorably",
+  "dishonoured": "dishonored",
+  "dishonouring": "dishonoring",
+  "dishonours": "dishonors",
+  "disorganisation": "disorganization",
+  "disorganised": "disorganized",
+  "distil": "distill",
+  "distils": "distills",
+  "dramatisation": "dramatization",
+  "dramatisations": "dramatizations",
+  "dramatise": "dramatize",
+  "dramatised": "dramatized",
+  "dramatises": "dramatizes",
+  "dramatising": "dramatizing",
+  "draught": "draft",
+  "draughtboard": "draftboard",
+  "draughtboards": "draftboards",
+  "draughtier": "draftier",
+  "draughtiest": "draftiest",
+  "draughts": "drafts",
+  "draughtsman": "draftsman",
+  "draughtsmanship": "draftsmanship",
+  "draughtsmen": "draftsmen",
+  "draughtswoman": "draftswoman",
+  "draughtswomen": "draftswomen",
+  "draughty": "drafty",
+  "drivelled": "driveled",
+  "drivelling": "driveling",
+  "duelled": "dueled",
+  "duelling": "dueling",
+  "economise": "economize",
+  "economised": "economized",
+  "economises": "economizes",
+  "economising": "economizing",
+  "editorialise": "editorialize",
+  "editorialised": "editorialized",
+  "editorialises": "editorializes",
+  "editorialising": "editorializing",
+  "edoema": "edema",
+  "empathise": "empathize",
+  "empathised": "empathized",
+  "empathises": "empathizes",
+  "empathising": "empathizing",
+  "emphasise": "emphasize",
+  "emphasised": "emphasized",
+  "emphasises": "emphasizes",
+  "emphasising": "emphasizing",
+  "enamelled": "enameled",
+  "enamelling": "enameling",
+  "enamoured": "enamored",
+  "encyclopaedia": "encyclopedia",
+  "encyclopaedias": "encyclopedias",
+  "encyclopaedic": "encyclopedic",
+  "endeavour": "endeavor",
+  "endeavoured": "endeavored",
+  "endeavouring": "endeavoring",
+  "endeavours": "endeavors",
+  "energise": "energize",
+  "energised": "energized",
+  "energises": "energizes",
+  "energising": "energizing",
+  "enrol": "enroll",
+  "enrols": "enrolls",
+  "enthral": "enthrall",
+  "enthrals": "enthralls",
+  "epaulette": "epaulet",
+  "epaulettes": "epaulets",
+  "epicentre": "epicenter",
+  "epicentres": "epicenters",
+  "epilogue": "epilog",
+  "epilogues": "epilogs",
+  "epitomise": "epitomize",
+  "epitomised": "epitomized",
+  "epitomises": "epitomizes",
+  "epitomising": "epitomizing",
+  "equalisation": "equalization",
+  "equalise": "equalize",
+  "equalised": "equalized",
+  "equaliser": "equalizer",
+  "equalisers": "equalizers",
+  "equalises": "equalizes",
+  "equalising": "equalizing",
+  "eulogise": "eulogize",
+  "eulogised": "eulogized",
+  "eulogises": "eulogizes",
+  "eulogising": "eulogizing",
            +
              "evangelise": "evangelize",
         
     | 
| 526 | 
         
            +
              "evangelised": "evangelized",
         
     | 
| 527 | 
         
            +
              "evangelises": "evangelizes",
         
     | 
| 528 | 
         
            +
              "evangelising": "evangelizing",
         
     | 
| 529 | 
         
            +
              "exorcise": "exorcize",
         
     | 
| 530 | 
         
            +
              "exorcised": "exorcized",
         
     | 
| 531 | 
         
            +
              "exorcises": "exorcizes",
         
     | 
| 532 | 
         
            +
              "exorcising": "exorcizing",
         
     | 
| 533 | 
         
            +
              "extemporisation": "extemporization",
         
     | 
| 534 | 
         
            +
              "extemporise": "extemporize",
         
     | 
| 535 | 
         
            +
              "extemporised": "extemporized",
         
     | 
| 536 | 
         
            +
              "extemporises": "extemporizes",
         
     | 
| 537 | 
         
            +
              "extemporising": "extemporizing",
         
     | 
| 538 | 
         
            +
              "externalisation": "externalization",
         
     | 
| 539 | 
         
            +
              "externalisations": "externalizations",
         
     | 
| 540 | 
         
            +
              "externalise": "externalize",
         
     | 
| 541 | 
         
            +
              "externalised": "externalized",
         
     | 
| 542 | 
         
            +
              "externalises": "externalizes",
         
     | 
| 543 | 
         
            +
              "externalising": "externalizing",
         
     | 
| 544 | 
         
            +
              "factorise": "factorize",
         
     | 
| 545 | 
         
            +
              "factorised": "factorized",
         
     | 
| 546 | 
         
            +
              "factorises": "factorizes",
         
     | 
| 547 | 
         
            +
              "factorising": "factorizing",
         
     | 
| 548 | 
         
            +
              "faecal": "fecal",
         
     | 
| 549 | 
         
            +
              "faeces": "feces",
         
     | 
| 550 | 
         
            +
              "familiarisation": "familiarization",
         
     | 
| 551 | 
         
            +
              "familiarise": "familiarize",
         
     | 
| 552 | 
         
            +
              "familiarised": "familiarized",
         
     | 
| 553 | 
         
            +
              "familiarises": "familiarizes",
         
     | 
| 554 | 
         
            +
              "familiarising": "familiarizing",
         
     | 
| 555 | 
         
            +
              "fantasise": "fantasize",
         
     | 
| 556 | 
         
            +
              "fantasised": "fantasized",
         
     | 
| 557 | 
         
            +
              "fantasises": "fantasizes",
         
     | 
| 558 | 
         
            +
              "fantasising": "fantasizing",
         
     | 
| 559 | 
         
            +
              "favour": "favor",
         
     | 
| 560 | 
         
            +
              "favourable": "favorable",
         
     | 
| 561 | 
         
            +
              "favourably": "favorably",
         
     | 
| 562 | 
         
            +
              "favoured": "favored",
         
     | 
| 563 | 
         
            +
              "favouring": "favoring",
         
     | 
| 564 | 
         
            +
              "favourite": "favorite",
         
     | 
| 565 | 
         
            +
              "favourites": "favorites",
         
     | 
| 566 | 
         
            +
              "favouritism": "favoritism",
         
     | 
| 567 | 
         
            +
              "favours": "favors",
         
     | 
| 568 | 
         
            +
              "feminise": "feminize",
         
     | 
| 569 | 
         
            +
              "feminised": "feminized",
         
     | 
| 570 | 
         
            +
              "feminises": "feminizes",
         
     | 
| 571 | 
         
            +
              "feminising": "feminizing",
         
     | 
| 572 | 
         
            +
              "fertilisation": "fertilization",
         
     | 
| 573 | 
         
            +
              "fertilise": "fertilize",
         
     | 
| 574 | 
         
            +
              "fertilised": "fertilized",
         
     | 
| 575 | 
         
            +
              "fertiliser": "fertilizer",
         
     | 
| 576 | 
         
            +
              "fertilisers": "fertilizers",
         
     | 
| 577 | 
         
            +
              "fertilises": "fertilizes",
         
     | 
| 578 | 
         
            +
              "fertilising": "fertilizing",
         
     | 
| 579 | 
         
            +
              "fervour": "fervor",
         
     | 
| 580 | 
         
            +
              "fibre": "fiber",
         
     | 
| 581 | 
         
            +
              "fibreglass": "fiberglass",
         
     | 
| 582 | 
         
            +
              "fibres": "fibers",
         
     | 
| 583 | 
         
            +
              "fictionalisation": "fictionalization",
         
     | 
| 584 | 
         
            +
              "fictionalisations": "fictionalizations",
         
     | 
| 585 | 
         
            +
              "fictionalise": "fictionalize",
         
     | 
| 586 | 
         
            +
              "fictionalised": "fictionalized",
         
     | 
| 587 | 
         
            +
              "fictionalises": "fictionalizes",
         
     | 
| 588 | 
         
            +
              "fictionalising": "fictionalizing",
         
     | 
| 589 | 
         
            +
              "fillet": "filet",
         
     | 
| 590 | 
         
            +
              "filleted": "fileted",
         
     | 
| 591 | 
         
            +
              "filleting": "fileting",
         
     | 
| 592 | 
         
            +
              "fillets": "filets",
         
     | 
| 593 | 
         
            +
              "finalisation": "finalization",
         
     | 
| 594 | 
         
            +
              "finalise": "finalize",
         
     | 
| 595 | 
         
            +
              "finalised": "finalized",
         
     | 
| 596 | 
         
            +
              "finalises": "finalizes",
         
     | 
| 597 | 
         
            +
              "finalising": "finalizing",
         
     | 
| 598 | 
         
            +
              "flautist": "flutist",
         
     | 
| 599 | 
         
            +
              "flautists": "flutists",
         
     | 
| 600 | 
         
            +
              "flavour": "flavor",
         
     | 
| 601 | 
         
            +
              "flavoured": "flavored",
         
     | 
| 602 | 
         
            +
              "flavouring": "flavoring",
         
     | 
| 603 | 
         
            +
              "flavourings": "flavorings",
         
     | 
| 604 | 
         
            +
              "flavourless": "flavorless",
         
     | 
| 605 | 
         
            +
              "flavours": "flavors",
         
     | 
| 606 | 
         
            +
              "flavoursome": "flavorsome",
         
     | 
| 607 | 
         
            +
              "flyer / flier": "flier / flyer",
         
     | 
| 608 | 
         
            +
              "foetal": "fetal",
         
     | 
| 609 | 
         
            +
              "foetid": "fetid",
         
     | 
| 610 | 
         
            +
              "foetus": "fetus",
         
     | 
| 611 | 
         
            +
              "foetuses": "fetuses",
         
     | 
| 612 | 
         
            +
              "formalisation": "formalization",
         
     | 
| 613 | 
         
            +
              "formalise": "formalize",
         
     | 
| 614 | 
         
            +
              "formalised": "formalized",
         
     | 
| 615 | 
         
            +
              "formalises": "formalizes",
         
     | 
| 616 | 
         
            +
              "formalising": "formalizing",
         
     | 
| 617 | 
         
            +
              "fossilisation": "fossilization",
         
     | 
| 618 | 
         
            +
              "fossilise": "fossilize",
         
     | 
| 619 | 
         
            +
              "fossilised": "fossilized",
         
     | 
| 620 | 
         
            +
              "fossilises": "fossilizes",
         
     | 
| 621 | 
         
            +
              "fossilising": "fossilizing",
         
     | 
| 622 | 
         
            +
              "fraternisation": "fraternization",
         
     | 
| 623 | 
         
            +
              "fraternise": "fraternize",
         
     | 
| 624 | 
         
            +
              "fraternised": "fraternized",
         
     | 
| 625 | 
         
            +
              "fraternises": "fraternizes",
         
     | 
| 626 | 
         
            +
              "fraternising": "fraternizing",
         
     | 
| 627 | 
         
            +
              "fulfil": "fulfill",
         
     | 
| 628 | 
         
            +
              "fulfilment": "fulfillment",
         
     | 
| 629 | 
         
            +
              "fulfils": "fulfills",
         
     | 
| 630 | 
         
            +
              "funnelled": "funneled",
         
     | 
| 631 | 
         
            +
              "funnelling": "funneling",
         
     | 
| 632 | 
         
            +
              "gage": "gauge",
         
     | 
| 633 | 
         
            +
              "gaged": "gauged",
         
     | 
| 634 | 
         
            +
              "gages": "gauges",
         
     | 
| 635 | 
         
            +
              "gaging": "gauging",
         
     | 
| 636 | 
         
            +
              "galvanise": "galvanize",
         
     | 
| 637 | 
         
            +
              "galvanised": "galvanized",
         
     | 
| 638 | 
         
            +
              "galvanises": "galvanizes",
         
     | 
| 639 | 
         
            +
              "galvanising": "galvanizing",
         
     | 
| 640 | 
         
            +
              "gambolled": "gamboled",
         
     | 
| 641 | 
         
            +
              "gambolling": "gamboling",
         
     | 
| 642 | 
         
            +
              "gaol": "jail",
         
     | 
| 643 | 
         
            +
              "gaolbird": "jailbird",
         
     | 
| 644 | 
         
            +
              "gaolbirds": "jailbirds",
         
     | 
| 645 | 
         
            +
              "gaolbreak": "jailbreak",
         
     | 
| 646 | 
         
            +
              "gaolbreaks": "jailbreaks",
         
     | 
| 647 | 
         
            +
              "gaoled": "jailed",
         
     | 
| 648 | 
         
            +
              "gaoler": "jailer",
         
     | 
| 649 | 
         
            +
              "gaolers": "jailers",
         
     | 
| 650 | 
         
            +
              "gaoling": "jailing",
         
     | 
| 651 | 
         
            +
              "gaols": "jails",
         
     | 
| 652 | 
         
            +
              "gasses": "gases",
         
     | 
| 653 | 
         
            +
              "generalisation": "generalization",
         
     | 
| 654 | 
         
            +
              "generalisations": "generalizations",
         
     | 
| 655 | 
         
            +
              "generalise": "generalize",
         
     | 
| 656 | 
         
            +
              "generalised": "generalized",
         
     | 
| 657 | 
         
            +
              "generalises": "generalizes",
         
     | 
| 658 | 
         
            +
              "generalising": "generalizing",
         
     | 
| 659 | 
         
            +
              "ghettoise": "ghettoize",
         
     | 
| 660 | 
         
            +
              "ghettoised": "ghettoized",
         
     | 
| 661 | 
         
            +
              "ghettoises": "ghettoizes",
         
     | 
| 662 | 
         
            +
              "ghettoising": "ghettoizing",
         
     | 
| 663 | 
         
            +
              "gipsies": "gypsies",
         
     | 
| 664 | 
         
            +
              "glamor": "glamour",
         
     | 
| 665 | 
         
            +
              "glamorise": "glamorize",
         
     | 
| 666 | 
         
            +
              "glamorised": "glamorized",
         
     | 
| 667 | 
         
            +
              "glamorises": "glamorizes",
         
     | 
| 668 | 
         
            +
              "glamorising": "glamorizing",
         
     | 
| 669 | 
         
            +
              "globalisation": "globalization",
         
     | 
| 670 | 
         
            +
              "globalise": "globalize",
         
     | 
| 671 | 
         
            +
              "globalised": "globalized",
         
     | 
| 672 | 
         
            +
              "globalises": "globalizes",
         
     | 
| 673 | 
         
            +
              "globalising": "globalizing",
         
     | 
| 674 | 
         
            +
              "glueing": "gluing",
         
     | 
| 675 | 
         
            +
              "goitre": "goiter",
         
     | 
| 676 | 
         
            +
              "goitres": "goiters",
         
     | 
| 677 | 
         
            +
              "gonorrhoea": "gonorrhea",
         
     | 
| 678 | 
         
            +
              "gramme": "gram",
         
     | 
| 679 | 
         
            +
              "grammes": "grams",
         
     | 
| 680 | 
         
            +
              "gravelled": "graveled",
         
     | 
| 681 | 
         
            +
              "grey": "gray",
         
     | 
| 682 | 
         
            +
              "greyed": "grayed",
         
     | 
| 683 | 
         
            +
              "greying": "graying",
         
     | 
| 684 | 
         
            +
              "greyish": "grayish",
         
     | 
| 685 | 
         
            +
              "greyness": "grayness",
         
     | 
| 686 | 
         
            +
              "greys": "grays",
         
     | 
| 687 | 
         
            +
              "grovelled": "groveled",
         
     | 
| 688 | 
         
            +
              "grovelling": "groveling",
         
     | 
| 689 | 
         
            +
              "groyne": "groin",
         
     | 
| 690 | 
         
            +
              "groynes": "groins",
         
     | 
| 691 | 
         
            +
              "gruelling": "grueling",
         
     | 
| 692 | 
         
            +
              "gruellingly": "gruelingly",
         
     | 
| 693 | 
         
            +
              "gryphon": "griffin",
         
     | 
| 694 | 
         
            +
              "gryphons": "griffins",
         
     | 
| 695 | 
         
            +
              "gynaecological": "gynecological",
         
     | 
| 696 | 
         
            +
              "gynaecologist": "gynecologist",
         
     | 
| 697 | 
         
            +
              "gynaecologists": "gynecologists",
         
     | 
| 698 | 
         
            +
              "gynaecology": "gynecology",
         
     | 
| 699 | 
         
            +
              "haematological": "hematological",
         
     | 
| 700 | 
         
            +
              "haematologist": "hematologist",
         
     | 
| 701 | 
         
            +
              "haematologists": "hematologists",
         
     | 
| 702 | 
         
            +
              "haematology": "hematology",
         
     | 
| 703 | 
         
            +
              "haemoglobin": "hemoglobin",
         
     | 
| 704 | 
         
            +
              "haemophilia": "hemophilia",
         
     | 
| 705 | 
         
            +
              "haemophiliac": "hemophiliac",
         
     | 
| 706 | 
         
            +
              "haemophiliacs": "hemophiliacs",
         
     | 
| 707 | 
         
            +
              "haemorrhage": "hemorrhage",
         
     | 
| 708 | 
         
            +
              "haemorrhaged": "hemorrhaged",
         
     | 
| 709 | 
         
            +
              "haemorrhages": "hemorrhages",
         
     | 
| 710 | 
         
            +
              "haemorrhaging": "hemorrhaging",
         
     | 
| 711 | 
         
            +
              "haemorrhoids": "hemorrhoids",
         
     | 
| 712 | 
         
            +
              "harbour": "harbor",
         
     | 
| 713 | 
         
            +
              "harboured": "harbored",
         
     | 
| 714 | 
         
            +
              "harbouring": "harboring",
         
     | 
| 715 | 
         
            +
              "harbours": "harbors",
         
     | 
| 716 | 
         
            +
              "harmonisation": "harmonization",
         
     | 
| 717 | 
         
            +
              "harmonise": "harmonize",
         
     | 
| 718 | 
         
            +
              "harmonised": "harmonized",
         
     | 
| 719 | 
         
            +
              "harmonises": "harmonizes",
         
     | 
| 720 | 
         
            +
              "harmonising": "harmonizing",
         
     | 
| 721 | 
         
            +
              "homoeopath": "homeopath",
         
     | 
| 722 | 
         
            +
              "homoeopathic": "homeopathic",
         
     | 
| 723 | 
         
            +
              "homoeopaths": "homeopaths",
         
     | 
| 724 | 
         
            +
              "homoeopathy": "homeopathy",
         
     | 
| 725 | 
         
            +
              "homogenise": "homogenize",
         
     | 
| 726 | 
         
            +
              "homogenised": "homogenized",
         
     | 
| 727 | 
         
            +
              "homogenises": "homogenizes",
         
     | 
| 728 | 
         
            +
              "homogenising": "homogenizing",
         
     | 
| 729 | 
         
            +
              "honour": "honor",
         
     | 
| 730 | 
         
            +
              "honourable": "honorable",
         
     | 
| 731 | 
         
            +
              "honourably": "honorably",
         
     | 
| 732 | 
         
            +
              "honoured": "honored",
         
     | 
| 733 | 
         
            +
              "honouring": "honoring",
         
     | 
| 734 | 
         
            +
              "honours": "honors",
         
     | 
| 735 | 
         
            +
              "hospitalisation": "hospitalization",
         
     | 
| 736 | 
         
            +
              "hospitalise": "hospitalize",
         
     | 
| 737 | 
         
            +
              "hospitalised": "hospitalized",
         
     | 
| 738 | 
         
            +
              "hospitalises": "hospitalizes",
         
     | 
| 739 | 
         
            +
              "hospitalising": "hospitalizing",
         
     | 
| 740 | 
         
            +
              "humanise": "humanize",
         
     | 
| 741 | 
         
            +
              "humanised": "humanized",
         
     | 
| 742 | 
         
            +
              "humanises": "humanizes",
         
     | 
| 743 | 
         
            +
              "humanising": "humanizing",
         
     | 
| 744 | 
         
            +
              "humour": "humor",
         
     | 
| 745 | 
         
            +
              "humoured": "humored",
         
     | 
| 746 | 
         
            +
              "humouring": "humoring",
         
     | 
| 747 | 
         
            +
              "humourless": "humorless",
         
     | 
| 748 | 
         
            +
              "humours": "humors",
         
     | 
| 749 | 
         
            +
              "hybridise": "hybridize",
         
     | 
| 750 | 
         
            +
              "hybridised": "hybridized",
         
     | 
| 751 | 
         
            +
              "hybridises": "hybridizes",
         
     | 
| 752 | 
         
            +
              "hybridising": "hybridizing",
         
     | 
| 753 | 
         
            +
              "hypnotise": "hypnotize",
         
     | 
| 754 | 
         
            +
              "hypnotised": "hypnotized",
         
     | 
| 755 | 
         
            +
              "hypnotises": "hypnotizes",
         
     | 
| 756 | 
         
            +
              "hypnotising": "hypnotizing",
         
     | 
| 757 | 
         
            +
              "hypothesise": "hypothesize",
         
     | 
| 758 | 
         
            +
              "hypothesised": "hypothesized",
         
     | 
| 759 | 
         
            +
              "hypothesises": "hypothesizes",
         
     | 
| 760 | 
         
            +
              "hypothesising": "hypothesizing",
         
     | 
| 761 | 
         
            +
              "idealisation": "idealization",
         
     | 
| 762 | 
         
            +
              "idealise": "idealize",
         
     | 
| 763 | 
         
            +
              "idealised": "idealized",
         
     | 
| 764 | 
         
            +
              "idealises": "idealizes",
         
     | 
| 765 | 
         
            +
              "idealising": "idealizing",
         
     | 
| 766 | 
         
            +
              "idolise": "idolize",
         
     | 
| 767 | 
         
            +
              "idolised": "idolized",
         
     | 
| 768 | 
         
            +
              "idolises": "idolizes",
         
     | 
| 769 | 
         
            +
              "idolising": "idolizing",
         
     | 
| 770 | 
         
            +
              "immobilisation": "immobilization",
         
     | 
| 771 | 
         
            +
              "immobilise": "immobilize",
         
     | 
| 772 | 
         
            +
              "immobilised": "immobilized",
         
     | 
| 773 | 
         
            +
              "immobiliser": "immobilizer",
         
     | 
| 774 | 
         
            +
              "immobilisers": "immobilizers",
         
     | 
| 775 | 
         
            +
              "immobilises": "immobilizes",
         
     | 
| 776 | 
         
            +
              "immobilising": "immobilizing",
         
     | 
| 777 | 
         
            +
              "immortalise": "immortalize",
         
     | 
| 778 | 
         
            +
              "immortalised": "immortalized",
         
     | 
| 779 | 
         
            +
              "immortalises": "immortalizes",
         
     | 
| 780 | 
         
            +
              "immortalising": "immortalizing",
         
     | 
| 781 | 
         
            +
              "immunisation": "immunization",
         
     | 
| 782 | 
         
            +
              "immunise": "immunize",
         
     | 
| 783 | 
         
            +
              "immunised": "immunized",
         
     | 
| 784 | 
         
            +
              "immunises": "immunizes",
         
     | 
| 785 | 
         
            +
              "immunising": "immunizing",
         
     | 
| 786 | 
         
            +
              "impanelled": "impaneled",
         
     | 
| 787 | 
         
            +
              "impanelling": "impaneling",
         
     | 
| 788 | 
         
            +
              "imperilled": "imperiled",
         
     | 
| 789 | 
         
            +
              "imperilling": "imperiling",
         
     | 
| 790 | 
         
            +
              "individualise": "individualize",
         
     | 
| 791 | 
         
            +
              "individualised": "individualized",
         
     | 
| 792 | 
         
            +
              "individualises": "individualizes",
         
     | 
| 793 | 
         
            +
              "individualising": "individualizing",
         
     | 
| 794 | 
         
            +
              "industrialise": "industrialize",
         
     | 
| 795 | 
         
            +
              "industrialised": "industrialized",
         
     | 
| 796 | 
         
            +
              "industrialises": "industrializes",
         
     | 
| 797 | 
         
            +
              "industrialising": "industrializing",
         
     | 
| 798 | 
         
            +
              "inflexion": "inflection",
         
     | 
| 799 | 
         
            +
              "inflexions": "inflections",
         
     | 
| 800 | 
         
            +
              "initialise": "initialize",
         
     | 
| 801 | 
         
            +
              "initialised": "initialized",
         
     | 
| 802 | 
         
            +
              "initialises": "initializes",
         
     | 
| 803 | 
         
            +
              "initialising": "initializing",
         
     | 
| 804 | 
         
            +
              "initialled": "initialed",
         
     | 
| 805 | 
         
            +
              "initialling": "initialing",
         
     | 
| 806 | 
         
            +
              "instal": "install",
         
     | 
| 807 | 
         
            +
              "instalment": "installment",
         
     | 
| 808 | 
         
            +
              "instalments": "installments",
         
     | 
| 809 | 
         
            +
              "instals": "installs",
         
     | 
| 810 | 
         
            +
              "instil": "instill",
         
     | 
| 811 | 
         
            +
              "instils": "instills",
         
     | 
| 812 | 
         
            +
              "institutionalisation": "institutionalization",
         
     | 
| 813 | 
         
            +
              "institutionalise": "institutionalize",
         
     | 
| 814 | 
         
            +
              "institutionalised": "institutionalized",
         
     | 
| 815 | 
         
            +
              "institutionalises": "institutionalizes",
         
     | 
| 816 | 
         
            +
              "institutionalising": "institutionalizing",
         
     | 
| 817 | 
         
            +
              "intellectualise": "intellectualize",
         
     | 
| 818 | 
         
            +
              "intellectualised": "intellectualized",
         
     | 
| 819 | 
         
            +
              "intellectualises": "intellectualizes",
         
     | 
| 820 | 
         
            +
              "intellectualising": "intellectualizing",
         
     | 
| 821 | 
         
            +
              "internalisation": "internalization",
         
     | 
| 822 | 
         
            +
              "internalise": "internalize",
         
     | 
| 823 | 
         
            +
              "internalised": "internalized",
         
     | 
| 824 | 
         
            +
              "internalises": "internalizes",
         
     | 
| 825 | 
         
            +
              "internalising": "internalizing",
         
     | 
| 826 | 
         
            +
              "internationalisation": "internationalization",
         
     | 
| 827 | 
         
            +
              "internationalise": "internationalize",
         
     | 
| 828 | 
         
            +
              "internationalised": "internationalized",
         
     | 
| 829 | 
         
            +
              "internationalises": "internationalizes",
         
     | 
| 830 | 
         
            +
              "internationalising": "internationalizing",
         
     | 
| 831 | 
         
            +
              "ionisation": "ionization",
         
     | 
| 832 | 
         
            +
              "ionise": "ionize",
         
     | 
| 833 | 
         
            +
              "ionised": "ionized",
         
     | 
| 834 | 
         
            +
              "ioniser": "ionizer",
         
     | 
| 835 | 
         
            +
              "ionisers": "ionizers",
         
     | 
| 836 | 
         
            +
              "ionises": "ionizes",
         
     | 
| 837 | 
         
            +
              "ionising": "ionizing",
         
     | 
| 838 | 
         
            +
              "italicise": "italicize",
         
     | 
| 839 | 
         
            +
              "italicised": "italicized",
         
     | 
| 840 | 
         
            +
              "italicises": "italicizes",
         
     | 
| 841 | 
         
            +
              "italicising": "italicizing",
         
     | 
| 842 | 
         
            +
              "itemise": "itemize",
         
     | 
| 843 | 
         
            +
              "itemised": "itemized",
         
     | 
| 844 | 
         
            +
              "itemises": "itemizes",
         
     | 
| 845 | 
         
            +
              "itemising": "itemizing",
         
     | 
| 846 | 
         
            +
              "jeopardise": "jeopardize",
         
     | 
| 847 | 
         
            +
              "jeopardised": "jeopardized",
         
     | 
| 848 | 
         
            +
              "jeopardises": "jeopardizes",
         
     | 
| 849 | 
         
            +
              "jeopardising": "jeopardizing",
         
     | 
| 850 | 
         
            +
              "jewelled": "jeweled",
         
     | 
| 851 | 
         
            +
              "jeweller": "jeweler",
         
     | 
| 852 | 
         
            +
              "jewellers": "jewelers",
         
     | 
| 853 | 
         
            +
              "jewellery": "jewelry",
         
     | 
| 854 | 
         
            +
              "judgement": "judgment",
         
     | 
| 855 | 
         
            +
              "kilogramme": "kilogram",
         
     | 
| 856 | 
         
            +
              "kilogrammes": "kilograms",
         
     | 
| 857 | 
         
            +
              "kilometre": "kilometer",
         
     | 
| 858 | 
         
            +
              "kilometres": "kilometers",
         
     | 
| 859 | 
         
            +
              "labelled": "labeled",
         
     | 
| 860 | 
         
            +
              "labelling": "labeling",
         
     | 
| 861 | 
         
            +
              "labour": "labor",
         
     | 
| 862 | 
         
            +
              "laboured": "labored",
         
     | 
| 863 | 
         
            +
              "labourer": "laborer",
         
     | 
| 864 | 
         
            +
              "labourers": "laborers",
         
     | 
| 865 | 
         
            +
              "labouring": "laboring",
         
     | 
| 866 | 
         
            +
              "labours": "labors",
         
     | 
| 867 | 
         
            +
              "lacklustre": "lackluster",
         
     | 
| 868 | 
         
            +
              "legalisation": "legalization",
         
     | 
| 869 | 
         
            +
              "legalise": "legalize",
         
     | 
| 870 | 
         
            +
              "legalised": "legalized",
         
     | 
| 871 | 
         
            +
              "legalises": "legalizes",
         
     | 
| 872 | 
         
            +
              "legalising": "legalizing",
         
     | 
| 873 | 
         
            +
              "legitimise": "legitimize",
         
     | 
| 874 | 
         
            +
              "legitimised": "legitimized",
         
     | 
| 875 | 
         
            +
              "legitimises": "legitimizes",
         
     | 
| 876 | 
         
            +
              "legitimising": "legitimizing",
         
     | 
| 877 | 
         
            +
              "leukaemia": "leukemia",
         
     | 
| 878 | 
         
            +
              "levelled": "leveled",
         
     | 
| 879 | 
         
            +
              "leveller": "leveler",
         
     | 
| 880 | 
         
            +
              "levellers": "levelers",
         
     | 
| 881 | 
         
            +
              "levelling": "leveling",
         
     | 
| 882 | 
         
            +
              "libelled": "libeled",
         
     | 
| 883 | 
         
            +
              "libelling": "libeling",
         
     | 
| 884 | 
         
            +
              "libellous": "libelous",
         
     | 
| 885 | 
         
            +
              "liberalisation": "liberalization",
         
     | 
| 886 | 
         
            +
              "liberalise": "liberalize",
         
     | 
| 887 | 
         
            +
              "liberalised": "liberalized",
         
     | 
| 888 | 
         
            +
              "liberalises": "liberalizes",
         
     | 
| 889 | 
         
            +
              "liberalising": "liberalizing",
         
     | 
| 890 | 
         
            +
              "licence": "license",
         
     | 
| 891 | 
         
            +
              "licenced": "licensed",
         
     | 
| 892 | 
         
            +
              "licences": "licenses",
         
     | 
| 893 | 
         
            +
              "licencing": "licensing",
         
     | 
| 894 | 
         
            +
              "likeable": "likable",
         
     | 
| 895 | 
         
            +
              "lionisation": "lionization",
         
     | 
| 896 | 
         
            +
              "lionise": "lionize",
         
     | 
| 897 | 
         
            +
              "lionised": "lionized",
         
     | 
| 898 | 
         
            +
              "lionises": "lionizes",
         
     | 
| 899 | 
         
            +
              "lionising": "lionizing",
         
     | 
| 900 | 
         
            +
              "liquidise": "liquidize",
         
     | 
| 901 | 
         
            +
              "liquidised": "liquidized",
         
     | 
| 902 | 
         
            +
              "liquidiser": "liquidizer",
         
     | 
| 903 | 
         
            +
              "liquidisers": "liquidizers",
         
     | 
| 904 | 
         
            +
              "liquidises": "liquidizes",
         
     | 
| 905 | 
         
            +
              "liquidising": "liquidizing",
         
     | 
| 906 | 
         
            +
              "litre": "liter",
         
     | 
| 907 | 
         
            +
              "litres": "liters",
         
     | 
| 908 | 
         
            +
              "localise": "localize",
         
     | 
| 909 | 
         
            +
              "localised": "localized",
         
     | 
| 910 | 
         
            +
              "localises": "localizes",
         
     | 
| 911 | 
         
            +
              "localising": "localizing",
         
     | 
| 912 | 
         
            +
              "louvre": "louver",
         
     | 
| 913 | 
         
            +
              "louvred": "louvered",
         
     | 
| 914 | 
         
            +
              "louvres": "louvers",
         
     | 
| 915 | 
         
            +
              "lustre": "luster",
         
     | 
| 916 | 
         
            +
              "magnetise": "magnetize",
         
     | 
| 917 | 
         
            +
              "magnetised": "magnetized",
         
     | 
| 918 | 
         
            +
              "magnetises": "magnetizes",
         
     | 
| 919 | 
         
            +
              "magnetising": "magnetizing",
         
     | 
| 920 | 
         
            +
              "manoeuvrability": "maneuverability",
         
     | 
| 921 | 
         
            +
              "manoeuvrable": "maneuverable",
         
     | 
| 922 | 
         
            +
              "manoeuvre": "maneuver",
         
     | 
| 923 | 
         
            +
              "manoeuvred": "maneuvered",
         
     | 
| 924 | 
         
            +
              "manoeuvres": "maneuvers",
         
     | 
| 925 | 
         
            +
              "manoeuvring": "maneuvering",
         
     | 
| 926 | 
         
            +
              "manoeuvrings": "maneuverings",
         
     | 
| 927 | 
         
            +
              "marginalisation": "marginalization",
         
     | 
| 928 | 
         
            +
              "marginalise": "marginalize",
         
     | 
| 929 | 
         
            +
              "marginalised": "marginalized",
         
     | 
| 930 | 
         
            +
              "marginalises": "marginalizes",
         
     | 
| 931 | 
         
            +
              "marginalising": "marginalizing",
         
     | 
| 932 | 
         
            +
              "marshalled": "marshaled",
         
     | 
| 933 | 
         
            +
              "marshalling": "marshaling",
         
     | 
| 934 | 
         
            +
              "marvelled": "marveled",
         
     | 
| 935 | 
         
            +
              "marvelling": "marveling",
         
     | 
| 936 | 
         
            +
              "marvellous": "marvelous",
         
     | 
| 937 | 
         
            +
              "marvellously": "marvelously",
         
     | 
| 938 | 
         
            +
              "materialisation": "materialization",
         
     | 
| 939 | 
         
            +
              "materialise": "materialize",
         
     | 
| 940 | 
         
            +
              "materialised": "materialized",
         
     | 
| 941 | 
         
            +
              "materialises": "materializes",
         
     | 
| 942 | 
         
            +
              "materialising": "materializing",
         
     | 
| 943 | 
         
            +
              "maximisation": "maximization",
         
     | 
| 944 | 
         
            +
              "maximise": "maximize",
         
     | 
| 945 | 
         
            +
              "maximised": "maximized",
         
     | 
| 946 | 
         
            +
              "maximises": "maximizes",
         
     | 
| 947 | 
         
            +
              "maximising": "maximizing",
         
     | 
| 948 | 
         
            +
              "meagre": "meager",
         
     | 
| 949 | 
         
            +
              "mechanisation": "mechanization",
         
     | 
| 950 | 
         
            +
              "mechanise": "mechanize",
         
     | 
| 951 | 
         
            +
              "mechanised": "mechanized",
         
     | 
| 952 | 
         
            +
              "mechanises": "mechanizes",
         
     | 
| 953 | 
         
            +
              "mechanising": "mechanizing",
         
     | 
| 954 | 
         
            +
              "mediaeval": "medieval",
         
     | 
| 955 | 
         
            +
              "memorialise": "memorialize",
         
     | 
| 956 | 
         
            +
              "memorialised": "memorialized",
         
     | 
| 957 | 
         
            +
              "memorialises": "memorializes",
         
     | 
| 958 | 
         
            +
              "memorialising": "memorializing",
         
     | 
| 959 | 
         
            +
              "memorise": "memorize",
         
     | 
| 960 | 
         
            +
              "memorised": "memorized",
         
     | 
| 961 | 
         
            +
              "memorises": "memorizes",
         
     | 
| 962 | 
         
            +
              "memorising": "memorizing",
         
     | 
| 963 | 
         
            +
              "mesmerise": "mesmerize",
         
     | 
| 964 | 
         
            +
              "mesmerised": "mesmerized",
         
     | 
| 965 | 
         
            +
              "mesmerises": "mesmerizes",
         
     | 
| 966 | 
         
            +
              "mesmerising": "mesmerizing",
         
     | 
| 967 | 
         
            +
              "metabolise": "metabolize",
         
     | 
| 968 | 
         
            +
              "metabolised": "metabolized",
         
     | 
| 969 | 
         
            +
              "metabolises": "metabolizes",
         
     | 
| 970 | 
         
            +
              "metabolising": "metabolizing",
         
     | 
| 971 | 
         
            +
              "metre": "meter",
         
     | 
| 972 | 
         
            +
              "metres": "meters",
         
     | 
| 973 | 
         
            +
              "mhm": "hmm",
         
     | 
| 974 | 
         
            +
              "micrometre": "micrometer",
         
     | 
| 975 | 
         
            +
              "micrometres": "micrometers",
         
     | 
| 976 | 
         
            +
              "militarise": "militarize",
         
     | 
| 977 | 
         
            +
              "militarised": "militarized",
         
     | 
| 978 | 
         
            +
              "militarises": "militarizes",
         
     | 
| 979 | 
         
            +
              "militarising": "militarizing",
         
     | 
| 980 | 
         
            +
              "milligramme": "milligram",
         
     | 
| 981 | 
         
            +
              "milligrammes": "milligrams",
         
     | 
| 982 | 
         
            +
              "millilitre": "milliliter",
         
     | 
| 983 | 
         
            +
              "millilitres": "milliliters",
         
     | 
| 984 | 
         
            +
              "millimetre": "millimeter",
         
     | 
| 985 | 
         
            +
              "millimetres": "millimeters",
         
     | 
| 986 | 
         
            +
              "miniaturisation": "miniaturization",
         
     | 
| 987 | 
         
            +
              "miniaturise": "miniaturize",
         
     | 
| 988 | 
         
            +
              "miniaturised": "miniaturized",
         
     | 
| 989 | 
         
            +
              "miniaturises": "miniaturizes",
         
     | 
| 990 | 
         
            +
              "miniaturising": "miniaturizing",
         
     | 
| 991 | 
         
            +
              "minibusses": "minibuses",
         
     | 
| 992 | 
         
            +
              "minimise": "minimize",
         
     | 
| 993 | 
         
            +
              "minimised": "minimized",
         
     | 
| 994 | 
         
            +
              "minimises": "minimizes",
         
     | 
| 995 | 
         
            +
              "minimising": "minimizing",
         
     | 
| 996 | 
         
            +
              "misbehaviour": "misbehavior",
         
     | 
| 997 | 
         
            +
              "misdemeanour": "misdemeanor",
         
     | 
| 998 | 
         
            +
              "misdemeanours": "misdemeanors",
         
     | 
| 999 | 
         
            +
              "misspelt": "misspelled",
         
     | 
| 1000 | 
         
            +
              "mitre": "miter",
         
     | 
| 1001 | 
         
            +
              "mitres": "miters",
         
     | 
| 1002 | 
         
            +
              "mm": "hmm",
         
     | 
| 1003 | 
         
            +
              "mmm": "hmm",
         
     | 
| 1004 | 
         
            +
              "mobilisation": "mobilization",
         
     | 
| 1005 | 
         
            +
              "mobilise": "mobilize",
         
     | 
| 1006 | 
         
            +
              "mobilised": "mobilized",
         
     | 
| 1007 | 
         
            +
              "mobilises": "mobilizes",
         
     | 
| 1008 | 
         
            +
              "mobilising": "mobilizing",
         
     | 
| 1009 | 
         
            +
              "modelled": "modeled",
         
     | 
| 1010 | 
         
            +
              "modeller": "modeler",
         
     | 
| 1011 | 
         
            +
              "modellers": "modelers",
         
     | 
| 1012 | 
         
            +
              "modelling": "modeling",
         
     | 
| 1013 | 
         
            +
              "modernise": "modernize",
         
     | 
| 1014 | 
         
            +
              "modernised": "modernized",
         
     | 
| 1015 | 
         
            +
              "modernises": "modernizes",
         
     | 
| 1016 | 
         
            +
              "modernising": "modernizing",
         
     | 
| 1017 | 
         
            +
              "moisturise": "moisturize",
         
     | 
| 1018 | 
         
            +
              "moisturised": "moisturized",
         
     | 
| 1019 | 
         
            +
              "moisturiser": "moisturizer",
         
     | 
| 1020 | 
         
            +
              "moisturisers": "moisturizers",
         
     | 
| 1021 | 
         
            +
              "moisturises": "moisturizes",
         
     | 
| 1022 | 
         
            +
              "moisturising": "moisturizing",
         
     | 
| 1023 | 
         
            +
              "monologue": "monolog",
         
     | 
| 1024 | 
         
            +
              "monologues": "monologs",
         
     | 
| 1025 | 
         
            +
              "monopolisation": "monopolization",
         
     | 
| 1026 | 
         
            +
              "monopolise": "monopolize",
         
     | 
| 1027 | 
         
            +
              "monopolised": "monopolized",
         
     | 
| 1028 | 
         
            +
              "monopolises": "monopolizes",
         
     | 
| 1029 | 
         
            +
              "monopolising": "monopolizing",
         
     | 
| 1030 | 
         
            +
              "moralise": "moralize",
         
     | 
| 1031 | 
         
            +
              "moralised": "moralized",
         
     | 
| 1032 | 
         
            +
              "moralises": "moralizes",
         
     | 
| 1033 | 
         
            +
              "moralising": "moralizing",
         
     | 
| 1034 | 
         
            +
              "motorised": "motorized",
         
     | 
| 1035 | 
         
            +
              "mould": "mold",
         
     | 
| 1036 | 
         
            +
              "moulded": "molded",
         
     | 
| 1037 | 
         
            +
              "moulder": "molder",
         
     | 
| 1038 | 
         
            +
              "mouldered": "moldered",
         
     | 
| 1039 | 
         
            +
              "mouldering": "moldering",
         
     | 
| 1040 | 
         
            +
              "moulders": "molders",
         
     | 
| 1041 | 
         
            +
              "mouldier": "moldier",
         
     | 
| 1042 | 
         
            +
              "mouldiest": "moldiest",
         
     | 
| 1043 | 
         
            +
              "moulding": "molding",
         
     | 
| 1044 | 
         
            +
              "mouldings": "moldings",
         
     | 
| 1045 | 
         
            +
              "moulds": "molds",
         
     | 
| 1046 | 
         
            +
              "mouldy": "moldy",
         
     | 
| 1047 | 
         
            +
              "moult": "molt",
         
     | 
| 1048 | 
         
            +
              "moulted": "molted",
         
     | 
| 1049 | 
         
            +
              "moulting": "molting",
         
     | 
| 1050 | 
         
            +
              "moults": "molts",
         
     | 
| 1051 | 
         
            +
              "moustache": "mustache",
         
     | 
| 1052 | 
         
            +
              "moustached": "mustached",
         
     | 
| 1053 | 
         
            +
              "moustaches": "mustaches",
         
     | 
| 1054 | 
         
            +
              "moustachioed": "mustachioed",
         
     | 
| 1055 | 
         
            +
              "multicoloured": "multicolored",
         
     | 
| 1056 | 
         
            +
              "nationalisation": "nationalization",
         
     | 
| 1057 | 
         
            +
              "nationalisations": "nationalizations",
         
     | 
| 1058 | 
         
            +
              "nationalise": "nationalize",
         
     | 
| 1059 | 
         
            +
              "nationalised": "nationalized",
         
     | 
| 1060 | 
         
            +
              "nationalises": "nationalizes",
         
     | 
| 1061 | 
         
+  "nationalising": "nationalizing",
+  "naturalisation": "naturalization",
+  "naturalise": "naturalize",
+  "naturalised": "naturalized",
+  "naturalises": "naturalizes",
+  "naturalising": "naturalizing",
+  "neighbour": "neighbor",
+  "neighbourhood": "neighborhood",
+  "neighbourhoods": "neighborhoods",
+  "neighbouring": "neighboring",
+  "neighbourliness": "neighborliness",
+  "neighbourly": "neighborly",
+  "neighbours": "neighbors",
+  "neutralisation": "neutralization",
+  "neutralise": "neutralize",
+  "neutralised": "neutralized",
+  "neutralises": "neutralizes",
+  "neutralising": "neutralizing",
+  "normalisation": "normalization",
+  "normalise": "normalize",
+  "normalised": "normalized",
+  "normalises": "normalizes",
+  "normalising": "normalizing",
+  "odour": "odor",
+  "odourless": "odorless",
+  "odours": "odors",
+  "oesophagus": "esophagus",
+  "oesophaguses": "esophaguses",
+  "oestrogen": "estrogen",
+  "offence": "offense",
+  "offences": "offenses",
+  "omelette": "omelet",
+  "omelettes": "omelets",
+  "optimise": "optimize",
+  "optimised": "optimized",
+  "optimises": "optimizes",
+  "optimising": "optimizing",
+  "organisation": "organization",
+  "organisational": "organizational",
+  "organisations": "organizations",
+  "organise": "organize",
+  "organised": "organized",
+  "organiser": "organizer",
+  "organisers": "organizers",
+  "organises": "organizes",
+  "organising": "organizing",
+  "orthopaedic": "orthopedic",
+  "orthopaedics": "orthopedics",
+  "ostracise": "ostracize",
+  "ostracised": "ostracized",
+  "ostracises": "ostracizes",
+  "ostracising": "ostracizing",
+  "outmanoeuvre": "outmaneuver",
+  "outmanoeuvred": "outmaneuvered",
+  "outmanoeuvres": "outmaneuvers",
+  "outmanoeuvring": "outmaneuvering",
+  "overemphasise": "overemphasize",
+  "overemphasised": "overemphasized",
+  "overemphasises": "overemphasizes",
+  "overemphasising": "overemphasizing",
+  "oxidisation": "oxidization",
+  "oxidise": "oxidize",
+  "oxidised": "oxidized",
+  "oxidises": "oxidizes",
+  "oxidising": "oxidizing",
+  "paederast": "pederast",
+  "paederasts": "pederasts",
+  "paediatric": "pediatric",
+  "paediatrician": "pediatrician",
+  "paediatricians": "pediatricians",
+  "paediatrics": "pediatrics",
+  "paedophile": "pedophile",
+  "paedophiles": "pedophiles",
+  "paedophilia": "pedophilia",
+  "palaeolithic": "paleolithic",
+  "palaeontologist": "paleontologist",
+  "palaeontologists": "paleontologists",
+  "palaeontology": "paleontology",
+  "panelled": "paneled",
+  "panelling": "paneling",
+  "panellist": "panelist",
+  "panellists": "panelists",
+  "paralyse": "paralyze",
+  "paralysed": "paralyzed",
+  "paralyses": "paralyzes",
+  "paralysing": "paralyzing",
+  "parcelled": "parceled",
+  "parcelling": "parceling",
+  "parlour": "parlor",
+  "parlours": "parlors",
+  "particularise": "particularize",
+  "particularised": "particularized",
+  "particularises": "particularizes",
+  "particularising": "particularizing",
+  "passivisation": "passivization",
+  "passivise": "passivize",
+  "passivised": "passivized",
+  "passivises": "passivizes",
+  "passivising": "passivizing",
+  "pasteurisation": "pasteurization",
+  "pasteurise": "pasteurize",
+  "pasteurised": "pasteurized",
+  "pasteurises": "pasteurizes",
+  "pasteurising": "pasteurizing",
+  "patronise": "patronize",
+  "patronised": "patronized",
+  "patronises": "patronizes",
+  "patronising": "patronizing",
+  "patronisingly": "patronizingly",
+  "pedalled": "pedaled",
+  "pedalling": "pedaling",
+  "pedestrianisation": "pedestrianization",
+  "pedestrianise": "pedestrianize",
+  "pedestrianised": "pedestrianized",
+  "pedestrianises": "pedestrianizes",
+  "pedestrianising": "pedestrianizing",
+  "penalise": "penalize",
+  "penalised": "penalized",
+  "penalises": "penalizes",
+  "penalising": "penalizing",
+  "pencilled": "penciled",
+  "pencilling": "penciling",
+  "personalise": "personalize",
+  "personalised": "personalized",
+  "personalises": "personalizes",
+  "personalising": "personalizing",
+  "pharmacopoeia": "pharmacopeia",
+  "pharmacopoeias": "pharmacopeias",
+  "philosophise": "philosophize",
+  "philosophised": "philosophized",
+  "philosophises": "philosophizes",
+  "philosophising": "philosophizing",
+  "philtre": "filter",
+  "philtres": "filters",
+  "phoney": "phony",
+  "plagiarise": "plagiarize",
+  "plagiarised": "plagiarized",
+  "plagiarises": "plagiarizes",
+  "plagiarising": "plagiarizing",
+  "plough": "plow",
+  "ploughed": "plowed",
+  "ploughing": "plowing",
+  "ploughman": "plowman",
+  "ploughmen": "plowmen",
+  "ploughs": "plows",
+  "ploughshare": "plowshare",
+  "ploughshares": "plowshares",
+  "polarisation": "polarization",
+  "polarise": "polarize",
+  "polarised": "polarized",
+  "polarises": "polarizes",
+  "polarising": "polarizing",
+  "politicisation": "politicization",
+  "politicise": "politicize",
+  "politicised": "politicized",
+  "politicises": "politicizes",
+  "politicising": "politicizing",
+  "popularisation": "popularization",
+  "popularise": "popularize",
+  "popularised": "popularized",
+  "popularises": "popularizes",
+  "popularising": "popularizing",
+  "pouffe": "pouf",
+  "pouffes": "poufs",
+  "practise": "practice",
+  "practised": "practiced",
+  "practises": "practices",
+  "practising": "practicing",
+  "praesidium": "presidium",
+  "praesidiums": "presidiums",
+  "pressurisation": "pressurization",
+  "pressurise": "pressurize",
+  "pressurised": "pressurized",
+  "pressurises": "pressurizes",
+  "pressurising": "pressurizing",
+  "pretence": "pretense",
+  "pretences": "pretenses",
+  "primaeval": "primeval",
+  "prioritisation": "prioritization",
+  "prioritise": "prioritize",
+  "prioritised": "prioritized",
+  "prioritises": "prioritizes",
+  "prioritising": "prioritizing",
+  "privatisation": "privatization",
+  "privatisations": "privatizations",
+  "privatise": "privatize",
+  "privatised": "privatized",
+  "privatises": "privatizes",
+  "privatising": "privatizing",
+  "professionalisation": "professionalization",
+  "professionalise": "professionalize",
+  "professionalised": "professionalized",
+  "professionalises": "professionalizes",
+  "professionalising": "professionalizing",
+  "programme": "program",
+  "programmes": "programs",
+  "prologue": "prolog",
+  "prologues": "prologs",
+  "propagandise": "propagandize",
+  "propagandised": "propagandized",
+  "propagandises": "propagandizes",
+  "propagandising": "propagandizing",
+  "proselytise": "proselytize",
+  "proselytised": "proselytized",
+  "proselytiser": "proselytizer",
+  "proselytisers": "proselytizers",
+  "proselytises": "proselytizes",
+  "proselytising": "proselytizing",
+  "psychoanalyse": "psychoanalyze",
+  "psychoanalysed": "psychoanalyzed",
+  "psychoanalyses": "psychoanalyzes",
+  "psychoanalysing": "psychoanalyzing",
+  "publicise": "publicize",
+  "publicised": "publicized",
+  "publicises": "publicizes",
+  "publicising": "publicizing",
+  "pulverisation": "pulverization",
+  "pulverise": "pulverize",
+  "pulverised": "pulverized",
+  "pulverises": "pulverizes",
+  "pulverising": "pulverizing",
+  "pummelled": "pummeled",
+  "pummelling": "pummeling",
+  "pyjama": "pajama",
+  "pyjamas": "pajamas",
+  "pzazz": "pizzazz",
+  "quarrelled": "quarreled",
+  "quarrelling": "quarreling",
+  "radicalise": "radicalize",
+  "radicalised": "radicalized",
+  "radicalises": "radicalizes",
+  "radicalising": "radicalizing",
+  "rancour": "rancor",
+  "randomise": "randomize",
+  "randomised": "randomized",
+  "randomises": "randomizes",
+  "randomising": "randomizing",
+  "rationalisation": "rationalization",
+  "rationalisations": "rationalizations",
+  "rationalise": "rationalize",
+  "rationalised": "rationalized",
+  "rationalises": "rationalizes",
+  "rationalising": "rationalizing",
+  "ravelled": "raveled",
+  "ravelling": "raveling",
+  "realisable": "realizable",
+  "realisation": "realization",
+  "realisations": "realizations",
+  "realise": "realize",
+  "realised": "realized",
+  "realises": "realizes",
+  "realising": "realizing",
+  "recognisable": "recognizable",
+  "recognisably": "recognizably",
+  "recognisance": "recognizance",
+  "recognise": "recognize",
+  "recognised": "recognized",
+  "recognises": "recognizes",
+  "recognising": "recognizing",
+  "reconnoitre": "reconnoiter",
+  "reconnoitred": "reconnoitered",
+  "reconnoitres": "reconnoiters",
+  "reconnoitring": "reconnoitering",
+  "refuelled": "refueled",
+  "refuelling": "refueling",
+  "regularisation": "regularization",
+  "regularise": "regularize",
+  "regularised": "regularized",
+  "regularises": "regularizes",
+  "regularising": "regularizing",
+  "remodelled": "remodeled",
+  "remodelling": "remodeling",
+  "remould": "remold",
+  "remoulded": "remolded",
+  "remoulding": "remolding",
+  "remoulds": "remolds",
+  "reorganisation": "reorganization",
+  "reorganisations": "reorganizations",
+  "reorganise": "reorganize",
+  "reorganised": "reorganized",
+  "reorganises": "reorganizes",
+  "reorganising": "reorganizing",
+  "revelled": "reveled",
+  "reveller": "reveler",
+  "revellers": "revelers",
+  "revelling": "reveling",
+  "revitalise": "revitalize",
+  "revitalised": "revitalized",
+  "revitalises": "revitalizes",
+  "revitalising": "revitalizing",
+  "revolutionise": "revolutionize",
+  "revolutionised": "revolutionized",
+  "revolutionises": "revolutionizes",
+  "revolutionising": "revolutionizing",
+  "rhapsodise": "rhapsodize",
+  "rhapsodised": "rhapsodized",
+  "rhapsodises": "rhapsodizes",
+  "rhapsodising": "rhapsodizing",
+  "rigour": "rigor",
+  "rigours": "rigors",
+  "ritualised": "ritualized",
+  "rivalled": "rivaled",
+  "rivalling": "rivaling",
+  "romanticise": "romanticize",
+  "romanticised": "romanticized",
+  "romanticises": "romanticizes",
+  "romanticising": "romanticizing",
+  "rumour": "rumor",
+  "rumoured": "rumored",
+  "rumours": "rumors",
+  "sabre": "saber",
+  "sabres": "sabers",
+  "saltpetre": "saltpeter",
+  "sanitise": "sanitize",
+  "sanitised": "sanitized",
+  "sanitises": "sanitizes",
+  "sanitising": "sanitizing",
+  "satirise": "satirize",
+  "satirised": "satirized",
+  "satirises": "satirizes",
+  "satirising": "satirizing",
+  "saviour": "savior",
+  "saviours": "saviors",
+  "savour": "savor",
+  "savoured": "savored",
+  "savouries": "savories",
+  "savouring": "savoring",
+  "savours": "savors",
+  "savoury": "savory",
+  "scandalise": "scandalize",
+  "scandalised": "scandalized",
+  "scandalises": "scandalizes",
+  "scandalising": "scandalizing",
+  "sceptic": "skeptic",
+  "sceptical": "skeptical",
+  "sceptically": "skeptically",
+  "scepticism": "skepticism",
+  "sceptics": "skeptics",
+  "sceptre": "scepter",
+  "sceptres": "scepters",
+  "scrutinise": "scrutinize",
+  "scrutinised": "scrutinized",
+  "scrutinises": "scrutinizes",
+  "scrutinising": "scrutinizing",
+  "secularisation": "secularization",
+  "secularise": "secularize",
+  "secularised": "secularized",
+  "secularises": "secularizes",
+  "secularising": "secularizing",
+  "sensationalise": "sensationalize",
+  "sensationalised": "sensationalized",
+  "sensationalises": "sensationalizes",
+  "sensationalising": "sensationalizing",
+  "sensitise": "sensitize",
+  "sensitised": "sensitized",
+  "sensitises": "sensitizes",
+  "sensitising": "sensitizing",
+  "sentimentalise": "sentimentalize",
+  "sentimentalised": "sentimentalized",
+  "sentimentalises": "sentimentalizes",
+  "sentimentalising": "sentimentalizing",
+  "sepulchre": "sepulcher",
+  "sepulchres": "sepulchers",
+  "serialisation": "serialization",
+  "serialisations": "serializations",
+  "serialise": "serialize",
+  "serialised": "serialized",
+  "serialises": "serializes",
+  "serialising": "serializing",
+  "sermonise": "sermonize",
+  "sermonised": "sermonized",
+  "sermonises": "sermonizes",
+  "sermonising": "sermonizing",
+  "sheikh": "sheik",
+  "shovelled": "shoveled",
+  "shovelling": "shoveling",
+  "shrivelled": "shriveled",
+  "shrivelling": "shriveling",
+  "signalise": "signalize",
+  "signalised": "signalized",
+  "signalises": "signalizes",
+  "signalising": "signalizing",
+  "signalled": "signaled",
+  "signalling": "signaling",
+  "smoulder": "smolder",
+  "smouldered": "smoldered",
+  "smouldering": "smoldering",
+  "smoulders": "smolders",
+  "snivelled": "sniveled",
+  "snivelling": "sniveling",
+  "snorkelled": "snorkeled",
+  "snorkelling": "snorkeling",
+  "snowplough": "snowplow",
+  "snowploughs": "snowplows",
+  "socialisation": "socialization",
+  "socialise": "socialize",
+  "socialised": "socialized",
+  "socialises": "socializes",
+  "socialising": "socializing",
+  "sodomise": "sodomize",
+  "sodomised": "sodomized",
+  "sodomises": "sodomizes",
+  "sodomising": "sodomizing",
+  "solemnise": "solemnize",
+  "solemnised": "solemnized",
+  "solemnises": "solemnizes",
+  "solemnising": "solemnizing",
+  "sombre": "somber",
+  "specialisation": "specialization",
+  "specialisations": "specializations",
+  "specialise": "specialize",
+  "specialised": "specialized",
+  "specialises": "specializes",
+  "specialising": "specializing",
+  "spectre": "specter",
+  "spectres": "specters",
+  "spiralled": "spiraled",
+  "spiralling": "spiraling",
+  "splendour": "splendor",
+  "splendours": "splendors",
+  "squirrelled": "squirreled",
+  "squirrelling": "squirreling",
+  "stabilisation": "stabilization",
+  "stabilise": "stabilize",
+  "stabilised": "stabilized",
+  "stabiliser": "stabilizer",
+  "stabilisers": "stabilizers",
+  "stabilises": "stabilizes",
+  "stabilising": "stabilizing",
+  "standardisation": "standardization",
+  "standardise": "standardize",
+  "standardised": "standardized",
+  "standardises": "standardizes",
+  "standardising": "standardizing",
+  "stencilled": "stenciled",
+  "stencilling": "stenciling",
+  "sterilisation": "sterilization",
+  "sterilisations": "sterilizations",
+  "sterilise": "sterilize",
+  "sterilised": "sterilized",
+  "steriliser": "sterilizer",
+  "sterilisers": "sterilizers",
+  "sterilises": "sterilizes",
+  "sterilising": "sterilizing",
+  "stigmatisation": "stigmatization",
+  "stigmatise": "stigmatize",
+  "stigmatised": "stigmatized",
+  "stigmatises": "stigmatizes",
+  "stigmatising": "stigmatizing",
+  "storey": "story",
+  "storeys": "stories",
+  "subsidisation": "subsidization",
+  "subsidise": "subsidize",
+  "subsidised": "subsidized",
+  "subsidiser": "subsidizer",
+  "subsidisers": "subsidizers",
+  "subsidises": "subsidizes",
+  "subsidising": "subsidizing",
+  "succour": "succor",
+  "succoured": "succored",
+  "succouring": "succoring",
+  "succours": "succors",
+  "sulphate": "sulfate",
+  "sulphates": "sulfates",
+  "sulphide": "sulfide",
+  "sulphides": "sulfides",
+  "sulphur": "sulfur",
+  "sulphurous": "sulfurous",
+  "summarise": "summarize",
+  "summarised": "summarized",
+  "summarises": "summarizes",
+  "summarising": "summarizing",
+  "swivelled": "swiveled",
+  "swivelling": "swiveling",
+  "symbolise": "symbolize",
+  "symbolised": "symbolized",
+  "symbolises": "symbolizes",
+  "symbolising": "symbolizing",
+  "sympathise": "sympathize",
+  "sympathised": "sympathized",
+  "sympathiser": "sympathizer",
+  "sympathisers": "sympathizers",
+  "sympathises": "sympathizes",
+  "sympathising": "sympathizing",
+  "synchronisation": "synchronization",
+  "synchronise": "synchronize",
+  "synchronised": "synchronized",
+  "synchronises": "synchronizes",
+  "synchronising": "synchronizing",
+  "synthesise": "synthesize",
+  "synthesised": "synthesized",
+  "synthesiser": "synthesizer",
+  "synthesisers": "synthesizers",
+  "synthesises": "synthesizes",
+  "synthesising": "synthesizing",
+  "syphon": "siphon",
+  "syphoned": "siphoned",
+  "syphoning": "siphoning",
+  "syphons": "siphons",
+  "systematisation": "systematization",
+  "systematise": "systematize",
+  "systematised": "systematized",
+  "systematises": "systematizes",
+  "systematising": "systematizing",
+  "tantalise": "tantalize",
+  "tantalised": "tantalized",
+  "tantalises": "tantalizes",
+  "tantalising": "tantalizing",
+  "tantalisingly": "tantalizingly",
+  "tasselled": "tasseled",
+  "technicolour": "technicolor",
+  "temporise": "temporize",
+  "temporised": "temporized",
+  "temporises": "temporizes",
+  "temporising": "temporizing",
+  "tenderise": "tenderize",
+  "tenderised": "tenderized",
+  "tenderises": "tenderizes",
+  "tenderising": "tenderizing",
+  "terrorise": "terrorize",
+  "terrorised": "terrorized",
+  "terrorises": "terrorizes",
+  "terrorising": "terrorizing",
+  "theatre": "theater",
+  "theatregoer": "theatergoer",
+  "theatregoers": "theatergoers",
+  "theatres": "theaters",
+  "theorise": "theorize",
+  "theorised": "theorized",
+  "theorises": "theorizes",
+  "theorising": "theorizing",
+  "tonne": "ton",
+  "tonnes": "tons",
+  "towelled": "toweled",
+  "towelling": "toweling",
+  "toxaemia": "toxemia",
+  "tranquillise": "tranquilize",
+  "tranquillised": "tranquilized",
+  "tranquilliser": "tranquilizer",
+  "tranquillisers": "tranquilizers",
+  "tranquillises": "tranquilizes",
+  "tranquillising": "tranquilizing",
+  "tranquillity": "tranquility",
+  "tranquillize": "tranquilize",
+  "tranquillized": "tranquilized",
+  "tranquillizer": "tranquilizer",
+  "tranquillizers": "tranquilizers",
+  "tranquillizes": "tranquilizes",
+  "tranquillizing": "tranquilizing",
+  "tranquilly": "tranquility",
+  "transistorised": "transistorized",
+  "traumatise": "traumatize",
+  "traumatised": "traumatized",
+  "traumatises": "traumatizes",
+  "traumatising": "traumatizing",
+  "travelled": "traveled",
+  "traveller": "traveler",
+  "travellers": "travelers",
+  "travelling": "traveling",
+  "travelog": "travelogue",
+  "travelogs": "travelogues",
+  "trialled": "trialed",
+  "trialling": "trialing",
+  "tricolour": "tricolor",
+  "tricolours": "tricolors",
+  "trivialise": "trivialize",
+  "trivialised": "trivialized",
+  "trivialises": "trivializes",
+  "trivialising": "trivializing",
+  "tumour": "tumor",
+  "tumours": "tumors",
+  "tunnelled": "tunneled",
+  "tunnelling": "tunneling",
+  "tyrannise": "tyrannize",
+  "tyrannised": "tyrannized",
+  "tyrannises": "tyrannizes",
+  "tyrannising": "tyrannizing",
+  "tyre": "tire",
+  "tyres": "tires",
+  "unauthorised": "unauthorized",
+  "uncivilised": "uncivilized",
+  "underutilised": "underutilized",
+  "unequalled": "unequaled",
+  "unfavourable": "unfavorable",
+  "unfavourably": "unfavorably",
+  "unionisation": "unionization",
+  "unionise": "unionize",
+  "unionised": "unionized",
+  "unionises": "unionizes",
+  "unionising": "unionizing",
+  "unorganised": "unorganized",
+  "unravelled": "unraveled",
+  "unravelling": "unraveling",
+  "unrecognisable": "unrecognizable",
+  "unrecognised": "unrecognized",
+  "unrivalled": "unrivaled",
+  "unsavoury": "unsavory",
+  "untrammelled": "untrammeled",
+  "urbanisation": "urbanization",
+  "urbanise": "urbanize",
+  "urbanised": "urbanized",
+  "urbanises": "urbanizes",
+  "urbanising": "urbanizing",
+  "utilisable": "utilizable",
+  "utilisation": "utilization",
+  "utilise": "utilize",
+  "utilised": "utilized",
+  "utilises": "utilizes",
+  "utilising": "utilizing",
+  "valour": "valor",
+  "vandalise": "vandalize",
+  "vandalised": "vandalized",
+  "vandalises": "vandalizes",
+  "vandalising": "vandalizing",
+  "vaporisation": "vaporization",
+  "vaporise": "vaporize",
+  "vaporised": "vaporized",
+  "vaporises": "vaporizes",
+  "vaporising": "vaporizing",
+  "vapour": "vapor",
+  "vapours": "vapors",
+  "verbalise": "verbalize",
+  "verbalised": "verbalized",
+  "verbalises": "verbalizes",
+  "verbalising": "verbalizing",
+  "victimisation": "victimization",
+  "victimise": "victimize",
+  "victimised": "victimized",
+  "victimises": "victimizes",
+  "victimising": "victimizing",
+  "videodisc": "videodisk",
+  "videodiscs": "videodisks",
+  "vigour": "vigor",
+  "visualisation": "visualization",
+  "visualisations": "visualizations",
+  "visualise": "visualize",
+  "visualised": "visualized",
+  "visualises": "visualizes",
+  "visualising": "visualizing",
+  "vocalisation": "vocalization",
+  "vocalisations": "vocalizations",
+  "vocalise": "vocalize",
+  "vocalised": "vocalized",
+  "vocalises": "vocalizes",
+  "vocalising": "vocalizing",
+  "vulcanised": "vulcanized",
+  "vulgarisation": "vulgarization",
+  "vulgarise": "vulgarize",
+  "vulgarised": "vulgarized",
+  "vulgarises": "vulgarizes",
+  "vulgarising": "vulgarizing",
+  "waggon": "wagon",
+  "waggons": "wagons",
+  "watercolour": "watercolor",
+  "watercolours": "watercolors",
+  "weaselled": "weaseled",
+  "weaselling": "weaseling",
+  "westernisation": "westernization",
+  "westernise": "westernize",
+  "westernised": "westernized",
+  "westernises": "westernizes",
+  "westernising": "westernizing",
+  "womanise": "womanize",
+  "womanised": "womanized",
+  "womaniser": "womanizer",
+  "womanisers": "womanizers",
+  "womanises": "womanizes",
+  "womanising": "womanizing",
+  "woollen": "woolen",
+  "woollens": "woolens",
+  "woollies": "woolies",
+  "woolly": "wooly",
+  "worshipped": "worshiped",
+  "worshipper": "worshiper",
+  "worshipping": "worshiping",
+  "yodelled": "yodeled",
+  "yodelling": "yodeling",
         
            +
              "yoghourt": "yogurt",
         
     | 
| 1739 | 
         
            +
              "yoghourts": "yogurts",
         
     | 
| 1740 | 
         
            +
              "yoghurt": "yogurt",
         
     | 
| 1741 | 
         
            +
              "yoghurts": "yogurts"
         
     | 
| 1742 | 
         
            +
            }
         
     | 
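Note: the entries above close out normalizer.json, a flat British-to-American spelling map used when normalizing English text for evaluation. A minimal sketch of how such a map could be applied to a transcript; the file name is from this commit, but the word-boundary regex and lowercase lookup are simplifications, not the exact normalizer logic:

import json
import re

# Load the spelling map added in this commit.
with open("normalizer.json") as f:
    spelling_map = json.load(f)

def americanize(text: str) -> str:
    # Swap any whole word that has an entry in the mapping; everything
    # else passes through unchanged. Case handling is simplified here.
    return re.sub(
        r"[A-Za-z]+",
        lambda m: spelling_map.get(m.group(0).lower(), m.group(0)),
        text,
    )

print(americanize("they visualised the vapour"))  # they visualized the vapor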
    	
preprocessor_config.json
ADDED
The diff for this file is too large to render. See raw diff.
    	
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3a3249e6e15f570c9d0efc91ec99b7441c886cb12122c58a274ac6a1822c3b08
+size 3055754841
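Note: pytorch_model.bin is stored via Git LFS, so the diff shows only the three-line pointer file (spec version, SHA-256 object id, byte size) rather than the weights themselves. A small sketch that parses such a pointer, with field names taken from the pointer shown above:

def parse_lfs_pointer(path: str) -> dict:
    """Parse a Git LFS pointer file into its version/oid/size fields."""
    fields = {}
    with open(path) as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields

# Before `git lfs pull`, the checked-out file is the pointer itself.
pointer = parse_lfs_pointer("pytorch_model.bin")
print(int(pointer["size"]) / 1e9, "GB")  # ~3.06 GB for this checkpoint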
    	
runs/Dec14_14-23-12_132-145-140-45/1671027857.0917404/events.out.tfevents.1671027857.132-145-140-45.618344.1
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:38c8777759cf48a0c39e2a29e9457426379fb480f682203bd72ca86b72bbda82
+size 5864
    	
runs/Dec14_14-23-12_132-145-140-45/events.out.tfevents.1671027857.132-145-140-45.618344.0
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9cfcadc9139e00668085bff5587670f600edc27ed2cc2099a36aa9ace07a80d2
+size 10894
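Note: the two events.out.tfevents files under runs/ are TensorBoard logs for this run (also LFS pointers here). Assuming the real files have been fetched with `git lfs pull`, the logged scalars can be read back, for example with TensorBoard's EventAccumulator; the "train/loss" tag is an assumption about what the Trainer logged:

from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

# Point the accumulator at the run directory from the diff above.
acc = EventAccumulator("runs/Dec14_14-23-12_132-145-140-45")
acc.Reload()

# "train/loss" is an assumed tag; acc.Tags()["scalars"] lists what exists.
for event in acc.Scalars("train/loss"):
    print(event.step, event.value)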
    	
special_tokens_map.json
ADDED
@@ -0,0 +1,133 @@
+{
+  "additional_special_tokens": [
+    "<|endoftext|>",
+    "<|startoftranscript|>",
+    "<|en|>",
+    "<|zh|>",
+    "<|de|>",
+    "<|es|>",
+    "<|ru|>",
+    "<|ko|>",
+    "<|fr|>",
+    "<|ja|>",
+    "<|pt|>",
+    "<|tr|>",
+    "<|pl|>",
+    "<|ca|>",
+    "<|nl|>",
+    "<|ar|>",
+    "<|sv|>",
+    "<|it|>",
+    "<|id|>",
+    "<|hi|>",
+    "<|fi|>",
+    "<|vi|>",
+    "<|iw|>",
+    "<|uk|>",
+    "<|el|>",
+    "<|ms|>",
+    "<|cs|>",
+    "<|ro|>",
+    "<|da|>",
+    "<|hu|>",
+    "<|ta|>",
+    "<|no|>",
+    "<|th|>",
+    "<|ur|>",
+    "<|hr|>",
+    "<|bg|>",
+    "<|lt|>",
+    "<|la|>",
+    "<|mi|>",
+    "<|ml|>",
+    "<|cy|>",
+    "<|sk|>",
+    "<|te|>",
+    "<|fa|>",
+    "<|lv|>",
+    "<|bn|>",
+    "<|sr|>",
+    "<|az|>",
+    "<|sl|>",
+    "<|kn|>",
+    "<|et|>",
+    "<|mk|>",
+    "<|br|>",
+    "<|eu|>",
+    "<|is|>",
+    "<|hy|>",
+    "<|ne|>",
+    "<|mn|>",
+    "<|bs|>",
+    "<|kk|>",
+    "<|sq|>",
+    "<|sw|>",
+    "<|gl|>",
+    "<|mr|>",
+    "<|pa|>",
+    "<|si|>",
+    "<|km|>",
+    "<|sn|>",
+    "<|yo|>",
+    "<|so|>",
+    "<|af|>",
+    "<|oc|>",
+    "<|ka|>",
+    "<|be|>",
+    "<|tg|>",
+    "<|sd|>",
+    "<|gu|>",
+    "<|am|>",
+    "<|yi|>",
+    "<|lo|>",
+    "<|uz|>",
+    "<|fo|>",
+    "<|ht|>",
+    "<|ps|>",
+    "<|tk|>",
+    "<|nn|>",
+    "<|mt|>",
+    "<|sa|>",
+    "<|lb|>",
+    "<|my|>",
+    "<|bo|>",
+    "<|tl|>",
+    "<|mg|>",
+    "<|as|>",
+    "<|tt|>",
+    "<|haw|>",
+    "<|ln|>",
+    "<|ha|>",
+    "<|ba|>",
+    "<|jw|>",
+    "<|su|>",
+    "<|translate|>",
+    "<|transcribe|>",
+    "<|startoflm|>",
+    "<|startofprev|>",
+    "<|nocaptions|>",
+    "<|notimestamps|>"
+  ],
+  "bos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<|endoftext|>",
+  "unk_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}
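Note: special_tokens_map.json registers Whisper's control tokens (99 language tags plus task and timestamp markers) as additional special tokens so the tokenizer never splits them. A quick sanity check after cloning this repo; loading from "." assumes you are inside the checkout, and a Hub repo id works the same way:

from transformers import WhisperTokenizer

# Load the tokenizer files saved alongside this checkpoint.
tokenizer = WhisperTokenizer.from_pretrained(".")

print(len(tokenizer.additional_special_tokens))   # 107, matching the list above
print(tokenizer.convert_tokens_to_ids("<|es|>"))  # each tag maps to a single id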
    	
tokenizer_config.json
ADDED
@@ -0,0 +1,36 @@
+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "bos_token": {
+    "__type": "AddedToken",
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "__type": "AddedToken",
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "errors": "replace",
+  "model_max_length": 1024,
+  "name_or_path": "juancopi81/whisper-medium-es-common-fleurs",
+  "pad_token": null,
+  "processor_class": "WhisperProcessor",
+  "return_attention_mask": false,
+  "special_tokens_map_file": null,
+  "tokenizer_class": "WhisperTokenizer",
+  "unk_token": {
+    "__type": "AddedToken",
+    "content": "",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}
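Note: tokenizer_config.json records that this tokenizer came from juancopi81/whisper-medium-es-common-fleurs and is meant to be used through WhisperProcessor. A sketch of inspecting the decoder prompt for the language/task pair this Spanish fine-tune presumably targets; the "spanish"/"transcribe" arguments are an assumption based on the checkpoint's name:

from transformers import WhisperProcessor

# Load feature extractor + tokenizer together from the repo checkout.
processor = WhisperProcessor.from_pretrained(".")

# These ids get forced at the start of generation: <|es|>, <|transcribe|>, ...
print(processor.get_decoder_prompt_ids(language="spanish", task="transcribe"))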
    	
training_args.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cba2e3c972f6ddedbda56d25f4cd1efc0f88bae273d74a2256414dc2a071f223
+size 3579
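Note: training_args.bin is a pickled TrainingArguments object (hence the opaque LFS pointer rather than readable JSON). If you want the hyperparameters behind this run, it can be deserialized with torch; unpickling arbitrary objects is only reasonable for a repo you trust, and the attribute names below assume a standard transformers TrainingArguments:

import torch

# training_args.bin holds a pickled transformers.TrainingArguments instance;
# weights_only=False is needed because it is not a plain tensor file.
args = torch.load("training_args.bin", weights_only=False)

print(args.learning_rate, args.max_steps, args.per_device_train_batch_size)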
    	
vocab.json
ADDED
The diff for this file is too large to render. See raw diff.