"""Transcribe a local WAV clip with a fine-tuned Whisper model from the Hugging Face Hub."""
import torch
import torchaudio
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# Whisper feature extractors expect 16 kHz audio; use one constant for both
# the resampling step and the processor call so they cannot drift apart.
TARGET_SAMPLE_RATE = 16000

repo_id = "Sven33/maze-whisper-3000"
processor = WhisperProcessor.from_pretrained(repo_id)
model = WhisperForConditionalGeneration.from_pretrained(repo_id)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

speech_array, sampling_rate = torchaudio.load("../../data/test_audio/673_clip.wav")
if sampling_rate != TARGET_SAMPLE_RATE:
    resampler = torchaudio.transforms.Resample(
        orig_freq=sampling_rate, new_freq=TARGET_SAMPLE_RATE
    )
    speech_array = resampler(speech_array)

# Take the first channel only (1-D mono waveform, as the processor expects).
# NOTE(review): for stereo clips this drops the second channel rather than
# downmixing — confirm the test audio is mono or that this is intended.
input_audio = speech_array[0].numpy()
inputs = (
    processor(input_audio, sampling_rate=TARGET_SAMPLE_RATE, return_tensors="pt")
    .input_features
    .to(device)
)

# inference_mode: no autograd bookkeeping is needed for generation.
with torch.inference_mode():
    predicted_ids = model.generate(inputs)

transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
print("Transcription:")
print(transcription)