import os
import subprocess

import gradio as gr
import pandas as pd
import spaces
import torch
from nemo.collections.asr.models import ASRModel

from nemo_align import align_tdt_to_ctc_timestamps

device = "cuda" if torch.cuda.is_available() else "cpu"


def process_audio(input_file, output_file):
    """Convert the input audio to a single channel at a 16000 Hz sample rate using sox."""
    gr.Info("Converting audio to single channel at a 16000 Hz sample rate")
    command = ['sox', input_file, output_file, 'channels', '1', 'rate', '16000']
    try:
        subprocess.run(command, check=True)
        gr.Info("Audio processed successfully")
        return output_file
    except (subprocess.CalledProcessError, FileNotFoundError):
        raise gr.Error("Failed to convert audio to single channel at a 16000 Hz sample rate")


def get_dataframe_segments(segments):
    """Build a (start_time, end_time, text) DataFrame from the aligned segments."""
    df = pd.DataFrame(columns=['start_time', 'end_time', 'text'])
    if len(segments) == 0:
        df.loc[0] = 0, 0, ''
        return df
    for segment in segments:
        text, start_time, end_time = segment
        if len(text) > 0:
            df.loc[len(df)] = round(start_time, 2), round(end_time, 2), text
    return df


def get_transcripts(audio_path, model):
    """Transcribe a single audio file with the loaded NeMo ASR model."""
    with torch.amp.autocast(device, dtype=torch.bfloat16, enabled=True):
        with torch.inference_mode():
            text = model.transcribe(audio=[audio_path])
    return text


def pick_asr_model():
    """Load the Parakeet TDT-CTC model and switch it to greedy batched decoding."""
    model = 'nvidia/parakeet-tdt_ctc-1.1b'
    asr_model = ASRModel.from_pretrained(model).to(device)
    asr_model.cfg.decoding.strategy = "greedy_batch"
    asr_model.change_decoding_strategy(asr_model.cfg.decoding)
    asr_model.eval()
    return asr_model


asr_model = pick_asr_model()


@spaces.GPU
def run_nemo_models(microphone, audio_path):
    """Transcribe the microphone recording if present, otherwise the uploaded file, and return timestamped segments."""
    path1 = microphone if microphone else audio_path
    new_path = process_audio(path1, "processed_audio.flac")
    gr.Info("Running NeMo Model")
    text = get_transcripts(new_path, asr_model)
    segments = align_tdt_to_ctc_timestamps(text, asr_model, new_path)
    df = get_dataframe_segments(segments)
    return df


# def run_speaker_diarization()

with gr.Blocks(
    title="NeMo Parakeet Model",
    css="""
    textarea { font-size: 18px; }
    #model_output_text_box span {
        font-size: 18px;
        font-weight: bold;
    }
    """,
    theme=gr.themes.Default(text_size=gr.themes.sizes.text_lg),  # make text slightly bigger (default is text_md)
) as demo:
    gr.HTML("