import gradio as gr
import os
os.system('pip install dashscope -U')
import dashscope
from dashscope import MultiModalConversation

API_KEY = os.environ['API_KEY']
dashscope.api_key = API_KEY
dashscope.base_http_api_url = "https://dashscope.aliyuncs.com/api/v1"
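
# Setup notes (assumptions, not verified against this Space's configuration):
# - API_KEY is expected as an environment variable / Space secret holding a
#   valid DashScope key; os.environ['API_KEY'] raises KeyError if it is unset.
# - base_http_api_url points at dashscope.aliyuncs.com, DashScope's default
#   (China) endpoint; deployments on the international service would need a
#   different base URL.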

def asr_inference(audio_file, context, language, enable_itn):
    # Always return a (text, language) pair, since the click handler
    # routes the result to two output components
    if not audio_file:
        return "Please upload an audio file", None
    messages = [
        {
            "role": "system",
            "content": [
                {"text": context},
            ]
        },
        {
            "role": "user",
            "content": [
                {"audio": audio_file},
            ]
        }
    ]
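    # Message layout as used here: the system turn carries free-form context
    # text that biases recognition (names, jargon), and the user turn carries
    # the audio as a file path or URL. This describes the structure built
    # above, not an exhaustive schema.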
    if language == 'auto':
        response = MultiModalConversation.call(
            model="qwen3-asr-flash",
            messages=messages,
            result_format="message",
            asr_options={
                "enable_lid": True,
                "enable_itn": enable_itn
            }
        )
    else:
        response = MultiModalConversation.call(
            model="qwen3-asr-flash",
            messages=messages,
            result_format="message",
            asr_options={
                "language": language,
                "enable_lid": True,
                "enable_itn": enable_itn
            }
        )
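    # asr_options, as used in the two calls above: "language" pins a decoding
    # language, "enable_lid" requests language identification (surfaced via
    # message annotations, read below), and "enable_itn" turns on inverse text
    # normalization (e.g. "twenty three" -> "23"). The branches differ only in
    # whether "language" is pinned.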
    try:
        if hasattr(response, 'status_code') and response.status_code == 200:
            if (hasattr(response, 'output') and
                    hasattr(response.output, 'choices') and
                    len(response.output.choices) > 0):
                choice = response.output.choices[0]
                if (hasattr(choice, 'message') and
                        hasattr(choice.message, 'content') and
                        len(choice.message.content) > 0):
                    content = choice.message.content[0]
                    if 'text' in content:
                        result_text = content['text']
                        if language == 'auto' and hasattr(choice.message, "annotations"):
                            result_lang = choice.message.annotations[0]['language']
                        else:
                            result_lang = None
                    else:
                        result_text = "No text content found"
                        result_lang = None
                else:
                    result_text = "Incomplete response structure"
                    result_lang = None
            else:
                result_text = "No recognition result found in response"
                result_lang = None
        else:
            status_code = getattr(response, 'status_code', 'Unknown')
            error_msg = getattr(response, 'message', 'Unknown error')
            result_text = f"Request failed (Status: {status_code}): {error_msg}"
            result_lang = None
    except Exception as e:
        result_text = f"Processing error: {str(e)}"
        result_lang = None

    # Map result_lang to display name
    lang_display = {
        "auto": "Auto Detect",
        "zh": "Chinese",
        "en": "English",
        "ja": "Japanese",
        "ko": "Korean",
        "es": "Spanish",
        "fr": "French",
        "de": "German",
        "ar": "Arabic",
        "it": "Italian",
        "ru": "Russian",
        "pt": "Portuguese"
    }
    if result_lang in lang_display:
        result_lang = lang_display[result_lang]
    elif result_lang is not None:
        result_lang = f"Unknown Language ({result_lang})"
    return result_text, result_lang
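
# Successful responses are expected to look roughly like this, judging from
# the access pattern above (not a full schema):
#   response.output.choices[0].message.content[0]['text']         -> transcript
#   response.output.choices[0].message.annotations[0]['language'] -> ISO code
#
# Standalone usage sketch (hypothetical local file path):
#   text, lang = asr_inference("/path/to/sample.wav", "", "auto", False)
#   print(lang, text)
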
with gr.Blocks(theme=gr.themes.Soft(), title="Speech Recognition Tool") as demo:
    # ========== LOGO Area (Centered + Enlarged) ==========
    gr.Markdown("""
    <div style="width: 100%; display: flex; justify-content: center; margin: 30px 0;">
        <img src="https://modelscope.oss-cn-beijing.aliyuncs.com/resource/00EE8C99-9C05-4236-A6D0-B58FF172D31B.png"
             alt="Qwen-ASR Logo"
             width="300"
             style="border-radius: 12px; box-shadow: 0 6px 12px rgba(0,0,0,0.15);"/>
    </div>
    """, sanitize_html=False)

    # ========== API Documentation Link ==========
    gr.Markdown("""
    <div style="text-align: center; margin: 10px 0; font-size: 14px; color: #555;">
        📖 <a href="https://help.aliyun.com/zh/dashscope/developer-reference/"
           target="_blank"
           style="color: #0066cc; text-decoration: none;">
            View DashScope API Documentation
        </a>
    </div>
    """, sanitize_html=False)

    gr.Markdown("Upload an audio file to get speech-to-text results.\nSupports custom context for tailored recognition, plus language detection and inverse text normalization.")
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(label="🎤 Upload Audio", type="filepath")
            context_input = gr.Textbox(label="📝 Context (Optional)", value="", interactive=True)
            language = gr.Dropdown(
                label="🌍 Language Setting",
                choices=[
                    ("Auto Detect", "auto"),
                    ("Chinese", "zh"),
                    ("English", "en"),
                    ("Japanese", "ja"),
                    ("Korean", "ko"),
                    ("Spanish", "es"),
                    ("French", "fr"),
                    ("German", "de"),
                    ("Arabic", "ar"),
                    ("Italian", "it"),
                    ("Russian", "ru"),
                    ("Portuguese", "pt")
                ],
                value="auto"
            )
            enable_itn = gr.Checkbox(label="🔢 Enable Inverse Text Normalization (ITN)", value=False)
            submit_btn = gr.Button("🚀 Start Recognition", variant="primary")
        with gr.Column():
            text_output = gr.Textbox(label="📄 Recognition Result", interactive=False, lines=6, max_lines=12)
            lang_output = gr.Textbox(label="🌐 Detected Language (only in auto mode)", interactive=False, lines=1, max_lines=12)
    submit_btn.click(
        fn=asr_inference,
        inputs=[audio_input, context_input, language, enable_itn],
        outputs=[text_output, lang_output]
    )
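    # Click wiring: Gradio passes the four input component values positionally
    # to asr_inference and routes its (text, lang) return tuple to the two
    # output boxes in order.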

    # Example Section
    gr.Markdown("### 💡 Examples")
    examples_data = {
        "Example 1 - CSGO Match": {
            "audio": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR/csgo.wav",
            "context": "A csgo match between NAVI and FazeClan in Major Paris 2023. S1mple and B1t are in NAVI. Ropz, Rain, Karrigan and Twistzz are in Faze.",
            "description": "Game commentary (Pro Terms & Names)"
        },
        "Example 2 - Noisy Environment": {
            "audio": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR/noise3.wav",
            "context": "",
            "description": "English Recognition in Noise"
        },
        "Example 3 - Complex Audio": {
            "audio": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR/noise1.wav",
            "context": "",
            "description": "Dialect Recognition in Heavy Noise"
        }
    }
    with gr.Row():
        for title, data in examples_data.items():
            with gr.Column():
                example_btn = gr.Button(f"🎵 {title}", variant="secondary", size="sm")
                gr.Markdown(f"*{data['description']}*", elem_classes=["example-desc"])
                # Bind this example's values as lambda defaults so each button
                # captures its own audio/context rather than the loop's final values
                example_btn.click(
                    fn=lambda audio=data['audio'], context=data['context']: (audio, context),
                    outputs=[audio_input, context_input]
                )
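    # Note: clicking an example only fills the audio and context inputs; the
    # user still presses "Start Recognition" to run asr_inference on them.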

if __name__ == "__main__":
    demo.launch()