import gradio as gr
import os
os.system('pip install dashscope -U')
import dashscope
from dashscope import MultiModalConversation

API_KEY = os.environ['API_KEY']
dashscope.api_key = API_KEY
dashscope.base_http_api_url = "https://dashscope.aliyuncs.com/api/v1"
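
# Setup notes (assumptions, not verified against this Space's configuration):
# - API_KEY is expected as an environment variable / Space secret holding a
#   valid DashScope key; os.environ['API_KEY'] raises KeyError if it is unset.
# - base_http_api_url points at dashscope.aliyuncs.com, DashScope's default
#   (China) endpoint; deployments on the international service would need a
#   different base URL.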

def asr_inference(audio_file, context, language, enable_itn):
    # Always return a (text, language) pair, since the click handler
    # routes the result to two output components
    if not audio_file:
        return "Please upload an audio file", None
    messages = [
        {
            "role": "system",
            "content": [
                {"text": context},
            ]
        },
        {
            "role": "user",
            "content": [
                {"audio": audio_file},
            ]
        }
    ]
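    # Message layout as used here: the system turn carries free-form context
    # text that biases recognition (names, jargon), and the user turn carries
    # the audio as a file path or URL. This describes the structure built
    # above, not an exhaustive schema.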
    if language == 'auto':
        response = MultiModalConversation.call(
            model="qwen3-asr-flash",
            messages=messages,
            result_format="message",
            asr_options={
                "enable_lid": True,
                "enable_itn": enable_itn
            }
        )
    else:
        response = MultiModalConversation.call(
            model="qwen3-asr-flash",
            messages=messages,
            result_format="message",
            asr_options={
                "language": language,
                "enable_lid": True,
                "enable_itn": enable_itn
            }
        )
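    # asr_options, as used in the two calls above: "language" pins a decoding
    # language, "enable_lid" requests language identification (surfaced via
    # message annotations, read below), and "enable_itn" turns on inverse text
    # normalization (e.g. "twenty three" -> "23"). The branches differ only in
    # whether "language" is pinned.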
    try:
        if hasattr(response, 'status_code') and response.status_code == 200:
            if (hasattr(response, 'output') and
                    hasattr(response.output, 'choices') and
                    len(response.output.choices) > 0):
                choice = response.output.choices[0]
                if (hasattr(choice, 'message') and
                        hasattr(choice.message, 'content') and
                        len(choice.message.content) > 0):
                    content = choice.message.content[0]
                    if 'text' in content:
                        result_text = content['text']
                        if language == 'auto' and hasattr(choice.message, "annotations"):
                            result_lang = choice.message.annotations[0]['language']
                        else:
                            result_lang = None
                    else:
                        result_text = "No text content found"
                        result_lang = None
                else:
                    result_text = "Incomplete response structure"
                    result_lang = None
            else:
                result_text = "No recognition result found in response"
                result_lang = None
        else:
            status_code = getattr(response, 'status_code', 'Unknown')
            error_msg = getattr(response, 'message', 'Unknown error')
            result_text = f"Request failed (Status: {status_code}): {error_msg}"
            result_lang = None
    except Exception as e:
        result_text = f"Processing error: {str(e)}"
        result_lang = None

    # Map result_lang to display name
    lang_display = {
        "auto": "Auto Detect",
        "zh": "Chinese",
        "en": "English",
        "ja": "Japanese",
        "ko": "Korean",
        "es": "Spanish",
        "fr": "French",
        "de": "German",
        "ar": "Arabic",
        "it": "Italian",
        "ru": "Russian",
        "pt": "Portuguese"
    }
    if result_lang in lang_display:
        result_lang = lang_display[result_lang]
    elif result_lang is not None:
        result_lang = f"Unknown Language ({result_lang})"
    return result_text, result_lang
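
# Successful responses are expected to look roughly like this, judging from
# the access pattern above (not a full schema):
#   response.output.choices[0].message.content[0]['text']         -> transcript
#   response.output.choices[0].message.annotations[0]['language'] -> ISO code
#
# Standalone usage sketch (hypothetical local file path):
#   text, lang = asr_inference("/path/to/sample.wav", "", "auto", False)
#   print(lang, text)
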
with gr.Blocks(theme=gr.themes.Soft(), title="Speech Recognition Tool") as demo:
    # ========== LOGO Area (Centered + Enlarged) ==========
    gr.Markdown("""
    <div style="width: 100%; display: flex; justify-content: center; margin: 30px 0;">
        <img src="https://modelscope.oss-cn-beijing.aliyuncs.com/resource/00EE8C99-9C05-4236-A6D0-B58FF172D31B.png"
             alt="Qwen-ASR Logo"
             width="300"
             style="border-radius: 12px; box-shadow: 0 6px 12px rgba(0,0,0,0.15);"/>
    </div>
    """, sanitize_html=False)

    # ========== API Documentation Link ==========
    gr.Markdown("""
    <div style="text-align: center; margin: 10px 0; font-size: 14px; color: #555;">
        📖 <a href="https://help.aliyun.com/zh/dashscope/developer-reference/"
           target="_blank"
           style="color: #0066cc; text-decoration: none;">
            View DashScope API Documentation
        </a>
    </div>
    """, sanitize_html=False)

    gr.Markdown("Upload an audio file to get speech-to-text results.\nSupports custom context for tailored recognition, plus language detection and inverse text normalization.")
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(label="🎤 Upload Audio", type="filepath")
            context_input = gr.Textbox(label="📝 Context (Optional)", value="", interactive=True)
            language = gr.Dropdown(
                label="🌍 Language Setting",
                choices=[
                    ("Auto Detect", "auto"),
                    ("Chinese", "zh"),
                    ("English", "en"),
                    ("Japanese", "ja"),
                    ("Korean", "ko"),
                    ("Spanish", "es"),
                    ("French", "fr"),
                    ("German", "de"),
                    ("Arabic", "ar"),
                    ("Italian", "it"),
                    ("Russian", "ru"),
                    ("Portuguese", "pt")
                ],
                value="auto"
            )
            enable_itn = gr.Checkbox(label="🔢 Enable Inverse Text Normalization (ITN)", value=False)
            submit_btn = gr.Button("🚀 Start Recognition", variant="primary")
        with gr.Column():
            text_output = gr.Textbox(label="📄 Recognition Result", interactive=False, lines=6, max_lines=12)
            lang_output = gr.Textbox(label="🌐 Detected Language (only in auto mode)", interactive=False, lines=1, max_lines=12)
    submit_btn.click(
        fn=asr_inference,
        inputs=[audio_input, context_input, language, enable_itn],
        outputs=[text_output, lang_output]
    )
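    # Click wiring: Gradio passes the four input component values positionally
    # to asr_inference and routes its (text, lang) return tuple to the two
    # output boxes in order.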

    # Example Section
    gr.Markdown("### 💡 Examples")
    examples_data = {
        "Example 1 - CSGO Match": {
            "audio": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR/csgo.wav",
            "context": "A csgo match between NAVI and FazeClan in Major Paris 2023. S1mple and B1t are in NAVI. Ropz, Rain, Karrigan and Twistzz are in Faze.",
            "description": "Game commentary (Pro Terms & Names)"
        },
        "Example 2 - Noisy Environment": {
            "audio": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR/noise3.wav",
            "context": "",
            "description": "English Recognition in Noise"
        },
        "Example 3 - Complex Audio": {
            "audio": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR/noise1.wav",
            "context": "",
            "description": "Dialect Recognition in Heavy Noise"
        }
    }
    with gr.Row():
        for title, data in examples_data.items():
            with gr.Column():
                example_btn = gr.Button(f"🎵 {title}", variant="secondary", size="sm")
                gr.Markdown(f"*{data['description']}*", elem_classes=["example-desc"])
                # Bind this example's values as lambda defaults so each button
                # captures its own audio/context rather than the loop's final values
                example_btn.click(
                    fn=lambda audio=data['audio'], context=data['context']: (audio, context),
                    outputs=[audio_input, context_input]
                )
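    # Note: clicking an example only fills the audio and context inputs; the
    # user still presses "Start Recognition" to run asr_inference on them.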

if __name__ == "__main__":
    demo.launch()