File size: 9,702 Bytes
ef32dc8 8be74ba 93010f6 5f8d3f6 8be74ba 24c2afb 8be74ba bba4630 0f588e7 8be74ba 6a52628 8be74ba 0f588e7 8be74ba 5769259 8be74ba 0f588e7 8be74ba 3acc258 8be74ba 3acc258 8be74ba 5769259 8be74ba a4430c5 dde4f6f a4430c5 f7ce062 8afc8d1 a4430c5 8be74ba e16bf2b 8be74ba ef32dc8 8be74ba ef32dc8 8be74ba ef32dc8 8be74ba ef32dc8 8be74ba b0a4174 ef32dc8 8be74ba ef32dc8 ec69747 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 |
import gradio as gr
import pandas as pd
banner_url = "https://huggingface.co/spaces/elmresearchcenter/open_universal_arabic_asr_leaderboard/resolve/main/banner.png"
BANNER = f'<div style="display: flex; justify-content: space-around;"><img src="{banner_url}" alt="Banner" style="width: 10vw; max-width: 600px;"> </div>'
INTRODUCTION_TEXT = "๐**Open Universal Arabic ASR Leaderboard**๐ benchmarks multi-dialect Arabic ASR models on various multi-dialect datasets.<br>Apart from the WER%/CER% for each test set, we also report the Average WER%/CER% and rank the models based on the Average WER, from lowest to highest.<br>To reproduce the benchmark numbers and request a model that is not listed, you can launch an issue/PR in our [GitHub repo](https://github.com/Natural-Language-Processing-Elm/open_universal_arabic_asr_leaderboard)๐.<br>For more detailed analysis such as models' robustness, speaker adaption, model efficiency and memory usage, please check our [paper](https://arxiv.org/pdf/2412.13788)."
CITATION_BUTTON_TEXT = """
@article{wang2024open,
title={Open Universal Arabic ASR Leaderboard},
author={Wang, Yingzhi and Alhmoud, Anas and Alqurishi, Muhammad},
journal={arXiv preprint arXiv:2412.13788},
year={2024}
}
"""
METRICS_TAB_TEXT = METRICS_TAB_TEXT = """
## Metrics
We report both the Word Error Rate (WER) and Character Error Rate (CER).
## Reproduction
The Open Universal Arabic ASR Leaderboard will be a continuous benchmark project.
\nWe open-source the evaluation scripts at our [GitHub repo](https://github.com/Natural-Language-Processing-Elm/open_universal_arabic_asr_leaderboard).
\nPlease launch a discussion in our GitHub repo to let us know if you want to learn about the performance of a new model.
## Benchmark datasets
| Test Set | Num Dialects | Test (h) |
|-------------------------------------------------------------------------------------------------|----------------|-------------|
| [SADA](https://www.kaggle.com/datasets/sdaiancai/sada2022) | 10 | 10.7 |
| [Common Voice 18.0](https://commonvoice.mozilla.org/en/datasets) | 25 | 12.6 |
| [MASC (Clean-Test)](https://ieee-dataport.org/open-access/masc-massive-arabic-speech-corpus) | 7 | 10.5 |
| [MASC (Noisy-Test)](https://ieee-dataport.org/open-access/masc-massive-arabic-speech-corpus) | 8 | 14.9 |
| [MGB-2](http://www.mgb-challenge.org/MGB-2.html) | Unspecified | 9.6 |
| [Casablanca](https://huggingface.co/datasets/UBC-NLP/Casablanca) | 8 | 7.7 |
## In-depth Analysis
We also provide a comprehensive analysis of the models' robustness, speaker adaptation, inference efficiency and memory consumption.
\nPlease check our [paper](https://arxiv.org/pdf/2412.13788) to learn more.
"""
def styled_message(message):
return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
UPDATES = "Nov 13th 2025:[New models included: 8 omnilingual-asr CTC&LLM models<br>Sep 30th 2025:[New models included: Qwen3-Omni-30B-A3B-Instruct<br>Sep 22th 2025:[New models included: Voxtral-mini and Voxtral-Small]<br>Jan 11th 2025:[New models included: Nvidia Parakeet-CTC-XXL-1.1B-Universal and Nvidia Parakeet-CTC-XXL-1.1B-Concat]<br>Jan 11th 2025:[New dataset included: Casablanca]"
results = {
"Model": ["omnilingual-asr/omniASR_LLM_7B", "omnilingual-asr/omniASR_LLM_3B", "omnilingual-asr/omniASR_LLM_1B", "Qwen/Qwen3-Omni-30B-A3B-Instruct", "nvidia-conformer-ctc-large-arabic (lm)", "omnilingual-asr/omniASR_LLM_300M", "mistralai/Voxtral-Small-24B-2507", "nvidia-conformer-ctc-large-arabic (greedy)", "openai/whisper-large-v3", "omnilingual-asr/omniASR_CTC_3B", "omnilingual-asr/omniASR_CTC_7B", "facebook/seamless-m4t-v2-large", "omnilingual-asr/omniASR_CTC_1B", "openai/whisper-large-v3-turbo", "openai/whisper-large-v2", "openai/whisper-large", "mistralai/Voxtral-Mini-3B-2507", "asafaya/hubert-large-arabic-transcribe", "openai/whisper-medium", "nvidia-Parakeet-ctc-1.1b-concat", "omnilingual-asr/omniASR_CTC_300M", "nvidia-Parakeet-ctc-1.1b-universal", "facebook/mms-1b-all", "openai/whisper-small", "whitefox123/w2v-bert-2.0-arabic-4", "jonatasgrosman/wav2vec2-large-xlsr-53-arabic", "speechbrain/asr-wav2vec2-commonvoice-14-ar"],
"Average WERโฌ๏ธ": [28.32, 29.96, 29.96, 30.71, 32.91, 32.96, 34.47, 34.74, 36.86, 37.78, 38.12, 38.16, 39.29, 40.05, 40.20, 42.57, 42.58, 45.50, 45.57, 46.54, 46.65, 51.96, 54.54, 55.13, 58.13, 60.98, 65.74],
"Average CER": [12.52, 13.77, 13.40, 13.67, 13.84, 14.84, 15.29, 13.37, 17.21, 19.79, 20.91, 17.03, 20.47, 18.87, 19.55, 20.49, 19.90, 17.35, 22.27, 23.88, 21.86, 25.19, 21.45, 21.68, 27.62, 25.61, 30.93],
"SADA WER": [41.61, 46.18, 43.84, 44.82, 44.52, 51.38, 50.82, 47.26, 55.96, 69.85, 72.69, 62.52, 71.42, 60.36, 57.46, 63.24, 63.65, 67.82, 67.71, 70.70, 78.11, 73.58, 77.48, 78.02, 87.34, 86.82, 88.54],
"SADA CER": [24.95, 27.27, 24.54, 26.11, 23.76, 29.10, 28.85, 22.54, 34.62, 51.70, 54.95, 37.61, 52.33, 37.67, 36.59, 40.16, 35.89, 31.83, 43.83, 46.70, 52.52, 49.48, 37.50, 33.17, 56.75, 44.20, 50.28],
"Common Voice\nWER": [8.75, 9.15, 9.55, 11.46, 8.80, 12.03, 15.25, 10.60, 17.83, 14.19, 12.47, 21.70, 17.55, 25.73, 21.77, 26.04, 22.12, 8.01, 28.07, 26.34, 27.90, 40.01, 26.52, 24.18, 41.79, 23.00, 29.17],
"Common Voice\nCER": [2.71, 2.80, 2.97, 4.28, 2.77, 4.04, 5.54, 3.05, 5.74, 5.74, 5.36, 6.24, 7.97, 10.89, 7.44, 9.61, 8.44, 2.37, 10.38, 9.82, 11.66, 14.64, 7.21, 6.79, 15.75, 6.64, 9.85],
"MASC(clean-test)\nWER": [19.69, 19.90, 20.03, 21.47, 23.74, 20.66, 23.96, 24.12, 24.66, 21.48, 21.08, 25.04, 22.76, 25.51, 27.25, 28.89, 28.37, 32.94, 29.99, 30.49, 28.40, 36.16, 38.82, 35.93, 37.82, 42.75, 49.10],
"MASC(clean-test)\nCER": [5.76, 6.13, 6.14, 5.59, 5.63, 6.22, 7.06, 5.63, 7.24, 6.11, 6.22, 7.19, 6.36, 7.55, 8.28, 9.05, 8.73, 7.15, 8.98, 8.41, 7.76, 10.29, 10.36, 9.01, 11.92, 11.87, 16.37],
"MASC(noisy-test)\nWER": [29.29, 30.03, 30.26, 30.85, 34.29, 32.45, 34.43, 35.64, 34.63, 34.60, 35.04, 33.24, 35.73, 37.16, 38.55, 40.79, 41.27, 50.16, 42.91, 45.95, 43.26, 50.03, 57.33, 56.36, 53.28, 64.27, 69.57],
"MASC(noisy-test)\nCER": [10.66, 11.27, 11.18, 11.28, 11.07, 12.23, 12.22, 11.02, 12.89, 12.32, 13.57, 11.92, 12.52, 13.93, 15.49, 16.31, 16.44, 15.62, 17.49, 18.72, 14.89, 20.09, 19.76, 19.43, 21.93, 24.17, 30.17],
"MGB-2 WER": [14.13, 14.22, 15.34, 13.09, 17.20, 16.58, 16.03, 19.69, 16.26, 18.96, 20.43, 20.23, 19.96, 17.75, 25.17, 24.28, 22.56, 37.51, 29.32, 24.94, 26.85, 30.68, 39.16, 48.64, 40.66, 56.29, 64.37],
"MGB-2 CER": [7.10, 7.06, 7.56, 6.20, 6.87, 7.86, 7.41, 7.46, 7.74, 8.28, 9.78, 9.37, 8.56, 8.34, 13.48, 12.10, 10.46, 11.07, 14.82, 9.87, 10.03, 11.36, 13.48, 15.56, 19.39, 20.44, 26.56],
"Casablanca\nWER": [56.46, 60.27, 60.68, 62.55, 68.90, 64.64, 66.30, 71.13, 71.81, 67.58, 67.02, 66.25, 68.32, 73.79, 71.01, 72.18, 77.52, 76.53, 75.44, 80.80, 75.35, 81.30, 87.95, 87.64, 87.88, 92.72, 93.68],
"Casablanca\nCER": [23.96, 28.06, 28.02, 28.53, 32.97, 29.61, 30.64, 30.50, 35.04, 34.59, 35.60, 29.85, 35.08, 34.83, 36.00, 35.71, 39.43, 36.03, 38.12, 49.77, 34.29, 45.31, 40.41, 46.12, 39.99, 46.33, 52.36],
}
original_df = pd.DataFrame(results)
original_df.sort_values(by="Average WERโฌ๏ธ", inplace=True)
TYPES = ['str', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number']
LEADERBOARD_CSS = """
html, body {
overflow-y: auto !important;
}
#leaderboard-table th .header-content {
min-width: 150px;
white-space: nowrap;
}
"""
def request_model(model_text):
return styled_message("๐ค Please launch a discussion in our GitHub repo, thank you. ๐ค")
with gr.Blocks(fill_width=False, fill_height=False, css=LEADERBOARD_CSS) as demo:
gr.HTML(BANNER, elem_id="banner")
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
with gr.Tabs(elem_classes="tab-buttons") as tabs:
with gr.TabItem("๐
Leaderboard", elem_id="od-benchmark-tab-table", id=0):
leaderboard_table = gr.Dataframe(
value=original_df,
datatype=TYPES,
elem_id="leaderboard-table",
interactive=False,
visible=True,
)
with gr.TabItem("๐ Metrics", elem_id="od-benchmark-tab-table", id=1):
gr.Markdown(METRICS_TAB_TEXT, elem_classes="markdown-text")
with gr.TabItem("โ๏ธโจ Request a model here!", elem_id="od-benchmark-tab-table", id=2):
with gr.Column():
gr.Markdown("# โ๏ธโจ Request results for a new model here!", elem_classes="markdown-text")
model_name_textbox = gr.Textbox(label="Model name (user_name/model_name)")
mdw_submission_result = gr.Markdown()
btn_submit = gr.Button(value="๐ Request")
btn_submit.click(request_model, [model_name_textbox], mdw_submission_result)
gr.Markdown(UPDATES, elem_classes="markdown-text")
with gr.Row():
with gr.Accordion("๐ Citation", open=False):
gr.Textbox(
value=CITATION_BUTTON_TEXT, lines=7,
label="Copy the BibTeX snippet to cite this source",
elem_id="citation-button",
show_copy_button=True,
)
demo.launch(allowed_paths=["banner.png"], ssr_mode=False) |