open_universal_arabic_asr_leaderboard

Running

File size: 9,702 Bytes

ef32dc8
 
8be74ba
93010f6
5f8d3f6
8be74ba
24c2afb
8be74ba
bba4630
0f588e7
 
 
 
 
 
8be74ba
 
 
 
6a52628
8be74ba
 
0f588e7
8be74ba
 
 
 
 
 
 
 
 
 
5769259
8be74ba
 
0f588e7
 
8be74ba
 
 
 
 
 
3acc258
8be74ba
 
3acc258
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8be74ba
 
 
 
 
5769259
8be74ba
a4430c5
dde4f6f
 
 
 
a4430c5
f7ce062
8afc8d1
a4430c5
 
8be74ba
 
 
 
e16bf2b
8be74ba
ef32dc8
 
 
8be74ba
 
 
 
 
 
 
 
ef32dc8
8be74ba
 
ef32dc8
8be74ba
ef32dc8
8be74ba
 
 
 
 
b0a4174
 
ef32dc8
 
 
8be74ba
 
 
ef32dc8
 
 
 
ec69747

import gradio as gr
import pandas as pd

banner_url = "https://huggingface.co/spaces/elmresearchcenter/open_universal_arabic_asr_leaderboard/resolve/main/banner.png"
BANNER = f'<div style="display: flex; justify-content: space-around;"><img src="{banner_url}" alt="Banner" style="width: 10vw; max-width: 600px;"> </div>'

INTRODUCTION_TEXT = "📖**Open Universal Arabic ASR Leaderboard**📖 benchmarks multi-dialect Arabic ASR models on various multi-dialect datasets.<br>Apart from the WER%/CER% for each test set, we also report the Average WER%/CER% and rank the models based on the Average WER, from lowest to highest.<br>To reproduce the benchmark numbers and request a model that is not listed, you can launch an issue/PR in our [GitHub repo](https://github.com/Natural-Language-Processing-Elm/open_universal_arabic_asr_leaderboard)😊.<br>For more detailed analysis such as models' robustness, speaker adaption, model efficiency and memory usage, please check our [paper](https://arxiv.org/pdf/2412.13788)."

CITATION_BUTTON_TEXT = """
@article{wang2024open,
  title={Open Universal Arabic ASR Leaderboard},
  author={Wang, Yingzhi and Alhmoud, Anas and Alqurishi, Muhammad},
  journal={arXiv preprint arXiv:2412.13788},
  year={2024}
}
"""

METRICS_TAB_TEXT = METRICS_TAB_TEXT = """
## Metrics
We report both the Word Error Rate (WER) and Character Error Rate (CER).
## Reproduction
The Open Universal Arabic ASR Leaderboard will be a continuous benchmark project. 
\nWe open-source the evaluation scripts at our [GitHub repo](https://github.com/Natural-Language-Processing-Elm/open_universal_arabic_asr_leaderboard).
\nPlease launch a discussion in our GitHub repo to let us know if you want to learn about the performance of a new model.

## Benchmark datasets
| Test Set                                                                                        | Num Dialects   | Test (h)    |
|-------------------------------------------------------------------------------------------------|----------------|-------------|
| [SADA](https://www.kaggle.com/datasets/sdaiancai/sada2022)                                      | 10             | 10.7        |
| [Common Voice 18.0](https://commonvoice.mozilla.org/en/datasets)                                | 25             | 12.6        |
| [MASC (Clean-Test)](https://ieee-dataport.org/open-access/masc-massive-arabic-speech-corpus)    | 7              | 10.5        |
| [MASC (Noisy-Test)](https://ieee-dataport.org/open-access/masc-massive-arabic-speech-corpus)    | 8              | 14.9        |
| [MGB-2](http://www.mgb-challenge.org/MGB-2.html)                                                | Unspecified    | 9.6         |
| [Casablanca](https://huggingface.co/datasets/UBC-NLP/Casablanca)                                | 8              | 7.7         |

## In-depth Analysis
We also provide a comprehensive analysis of the models' robustness, speaker adaptation, inference efficiency and memory consumption.
\nPlease check our [paper](https://arxiv.org/pdf/2412.13788) to learn more.
"""


def styled_message(message):
    return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"

UPDATES = "Nov 13th 2025:[New models included: 8 omnilingual-asr CTC&LLM models<br>Sep 30th 2025:[New models included: Qwen3-Omni-30B-A3B-Instruct<br>Sep 22th 2025:[New models included: Voxtral-mini and Voxtral-Small]<br>Jan 11th 2025:[New models included: Nvidia Parakeet-CTC-XXL-1.1B-Universal and Nvidia Parakeet-CTC-XXL-1.1B-Concat]<br>Jan 11th 2025:[New dataset included: Casablanca]"

results = {
    "Model": ["omnilingual-asr/omniASR_LLM_7B", "omnilingual-asr/omniASR_LLM_3B", "omnilingual-asr/omniASR_LLM_1B", "Qwen/Qwen3-Omni-30B-A3B-Instruct", "nvidia-conformer-ctc-large-arabic (lm)", "omnilingual-asr/omniASR_LLM_300M", "mistralai/Voxtral-Small-24B-2507", "nvidia-conformer-ctc-large-arabic (greedy)", "openai/whisper-large-v3", "omnilingual-asr/omniASR_CTC_3B", "omnilingual-asr/omniASR_CTC_7B", "facebook/seamless-m4t-v2-large", "omnilingual-asr/omniASR_CTC_1B", "openai/whisper-large-v3-turbo", "openai/whisper-large-v2", "openai/whisper-large", "mistralai/Voxtral-Mini-3B-2507", "asafaya/hubert-large-arabic-transcribe", "openai/whisper-medium", "nvidia-Parakeet-ctc-1.1b-concat", "omnilingual-asr/omniASR_CTC_300M", "nvidia-Parakeet-ctc-1.1b-universal", "facebook/mms-1b-all", "openai/whisper-small", "whitefox123/w2v-bert-2.0-arabic-4", "jonatasgrosman/wav2vec2-large-xlsr-53-arabic", "speechbrain/asr-wav2vec2-commonvoice-14-ar"],
    "Average WER⬇️": [28.32, 29.96, 29.96, 30.71, 32.91, 32.96, 34.47, 34.74, 36.86, 37.78, 38.12, 38.16, 39.29, 40.05, 40.20, 42.57, 42.58, 45.50, 45.57, 46.54, 46.65, 51.96, 54.54, 55.13, 58.13, 60.98, 65.74],
    "Average CER": [12.52, 13.77, 13.40, 13.67, 13.84, 14.84, 15.29, 13.37, 17.21, 19.79, 20.91, 17.03, 20.47, 18.87, 19.55, 20.49, 19.90, 17.35, 22.27, 23.88, 21.86, 25.19, 21.45, 21.68, 27.62, 25.61, 30.93],
    "SADA WER": [41.61, 46.18, 43.84, 44.82, 44.52, 51.38, 50.82, 47.26, 55.96, 69.85, 72.69, 62.52, 71.42, 60.36, 57.46, 63.24, 63.65, 67.82, 67.71, 70.70, 78.11, 73.58, 77.48, 78.02, 87.34, 86.82, 88.54],
    "SADA CER": [24.95, 27.27, 24.54, 26.11, 23.76, 29.10, 28.85, 22.54, 34.62, 51.70, 54.95, 37.61, 52.33, 37.67, 36.59, 40.16, 35.89, 31.83, 43.83, 46.70, 52.52, 49.48, 37.50, 33.17, 56.75, 44.20, 50.28],
    "Common Voice\nWER": [8.75, 9.15, 9.55, 11.46, 8.80, 12.03, 15.25, 10.60, 17.83, 14.19, 12.47, 21.70, 17.55, 25.73, 21.77, 26.04, 22.12, 8.01, 28.07, 26.34, 27.90, 40.01, 26.52, 24.18, 41.79, 23.00, 29.17],
    "Common Voice\nCER": [2.71, 2.80, 2.97, 4.28, 2.77, 4.04, 5.54, 3.05, 5.74, 5.74, 5.36, 6.24, 7.97, 10.89, 7.44, 9.61, 8.44, 2.37, 10.38, 9.82, 11.66, 14.64, 7.21, 6.79, 15.75, 6.64, 9.85],
    "MASC(clean-test)\nWER": [19.69, 19.90, 20.03, 21.47, 23.74, 20.66, 23.96, 24.12, 24.66, 21.48, 21.08, 25.04, 22.76, 25.51, 27.25, 28.89, 28.37, 32.94, 29.99, 30.49, 28.40, 36.16, 38.82, 35.93, 37.82, 42.75, 49.10],
    "MASC(clean-test)\nCER": [5.76, 6.13, 6.14, 5.59, 5.63, 6.22, 7.06, 5.63, 7.24, 6.11, 6.22, 7.19, 6.36, 7.55, 8.28, 9.05, 8.73, 7.15, 8.98, 8.41, 7.76, 10.29, 10.36, 9.01, 11.92, 11.87, 16.37],
    "MASC(noisy-test)\nWER": [29.29, 30.03, 30.26, 30.85, 34.29, 32.45, 34.43, 35.64, 34.63, 34.60, 35.04, 33.24, 35.73, 37.16, 38.55, 40.79, 41.27, 50.16, 42.91, 45.95, 43.26, 50.03, 57.33, 56.36, 53.28, 64.27, 69.57],
    "MASC(noisy-test)\nCER": [10.66, 11.27, 11.18, 11.28, 11.07, 12.23, 12.22, 11.02, 12.89, 12.32, 13.57, 11.92, 12.52, 13.93, 15.49, 16.31, 16.44, 15.62, 17.49, 18.72, 14.89, 20.09, 19.76, 19.43, 21.93, 24.17, 30.17],
    "MGB-2 WER": [14.13, 14.22, 15.34, 13.09, 17.20, 16.58, 16.03, 19.69, 16.26, 18.96, 20.43, 20.23, 19.96, 17.75, 25.17, 24.28, 22.56, 37.51, 29.32, 24.94, 26.85, 30.68, 39.16, 48.64, 40.66, 56.29, 64.37],
    "MGB-2 CER": [7.10, 7.06, 7.56, 6.20, 6.87, 7.86, 7.41, 7.46, 7.74, 8.28, 9.78, 9.37, 8.56, 8.34, 13.48, 12.10, 10.46, 11.07, 14.82, 9.87, 10.03, 11.36, 13.48, 15.56, 19.39, 20.44, 26.56],
    "Casablanca\nWER": [56.46, 60.27, 60.68, 62.55, 68.90, 64.64, 66.30, 71.13, 71.81, 67.58, 67.02, 66.25, 68.32, 73.79, 71.01, 72.18, 77.52, 76.53, 75.44, 80.80, 75.35, 81.30, 87.95, 87.64, 87.88, 92.72, 93.68],
    "Casablanca\nCER": [23.96, 28.06, 28.02, 28.53, 32.97, 29.61, 30.64, 30.50, 35.04, 34.59, 35.60, 29.85, 35.08, 34.83, 36.00, 35.71, 39.43, 36.03, 38.12, 49.77, 34.29, 45.31, 40.41, 46.12, 39.99, 46.33, 52.36],
}

original_df = pd.DataFrame(results)
original_df.sort_values(by="Average WER⬇️", inplace=True)

TYPES = ['str', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number']

LEADERBOARD_CSS = """
html, body {
  overflow-y: auto !important;
}

#leaderboard-table th .header-content {
    min-width: 150px;
    white-space: nowrap;
}
"""

def request_model(model_text):
    return styled_message("🤗 Please launch a discussion in our GitHub repo, thank you. 🤗")

with gr.Blocks(fill_width=False, fill_height=False, css=LEADERBOARD_CSS) as demo:
    gr.HTML(BANNER, elem_id="banner")
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 Leaderboard", elem_id="od-benchmark-tab-table", id=0):
            leaderboard_table = gr.Dataframe(
                value=original_df,
                datatype=TYPES,
                elem_id="leaderboard-table",
                interactive=False,
                visible=True,
            )

        with gr.TabItem("📈 Metrics", elem_id="od-benchmark-tab-table", id=1):
            gr.Markdown(METRICS_TAB_TEXT, elem_classes="markdown-text")

        with gr.TabItem("✉️✨ Request a model here!", elem_id="od-benchmark-tab-table", id=2):
            with gr.Column():
                gr.Markdown("# ✉️✨ Request results for a new model here!", elem_classes="markdown-text")
                model_name_textbox = gr.Textbox(label="Model name (user_name/model_name)")
                mdw_submission_result = gr.Markdown()
                btn_submit = gr.Button(value="🚀 Request")
                btn_submit.click(request_model, [model_name_textbox], mdw_submission_result)

    gr.Markdown(UPDATES, elem_classes="markdown-text")

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            gr.Textbox(
                value=CITATION_BUTTON_TEXT, lines=7,
                label="Copy the BibTeX snippet to cite this source",
                elem_id="citation-button",
                show_copy_button=True,
            )

demo.launch(allowed_paths=["banner.png"], ssr_mode=False)