# app.py
# Main Gradio dashboard for IndoNLP Space
import os
import json
import traceback
from typing import Optional

import gradio as gr
import pandas as pd
import plotly.express as px

from models import sentiment as sentiment_mod
from models import ner as ner_mod
from models import qa as qa_mod
from models import summarization as summ_mod
from metrics.evaluate import (
    evaluate_classification,
    evaluate_ner,
    evaluate_qa,
    load_leaderboard,
    save_leaderboard,
)
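
# NOTE: the models/ package is not included in this file. From the way it is used
# below, each task module (sentiment, ner, qa, summarization) is assumed to expose:
#   AVAILABLE_MODELS -- dict mapping a display key to a Hugging Face model id,
#                       e.g. {"indobert-base": "some-org/some-indonesian-model"}  (hypothetical)
#   predict(...)     -- task-specific inference:
#       sentiment:     predict(text, model_key)              -> list of (label, score) pairs
#       ner:           predict(text, model_key)              -> entity spans for render_ner_html
#       qa:            predict(context, question, model_key) -> {"answer": ..., "score": ...}
#       summarization: predict(text, model_key)              -> summary string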

# Ensure metrics dir
os.makedirs("metrics", exist_ok=True)
LEADERBOARD_PATH = "metrics/leaderboard.json"
leaderboard = load_leaderboard(LEADERBOARD_PATH)
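
# Judging from its use below, the leaderboard is a pandas DataFrame with at least the
# columns "task", "model" and "f1" (plus "accuracy"/"precision"/"recall" when available).
# load_leaderboard/save_leaderboard (in metrics/evaluate.py, not shown) are assumed to
# round-trip it to/from LEADERBOARD_PATH, e.g. as a list of JSON records like:
#   [{"task": "Sentiment", "model": "indobert-base", "f1": 0.91, "accuracy": 0.92, ...}]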


# Utility to create a plot from leaderboard
def leaderboard_plot(metric: str = "f1"):
    if leaderboard.empty:
        return px.bar(title="No leaderboard data")
    if metric not in leaderboard.columns:
        metric = "f1"
    fig = px.bar(
        leaderboard.sort_values(metric, ascending=False),
        x="model",
        y=metric,
        color="task",
        text=metric,
        title=f"Leaderboard by {metric.upper()}",
    )
    return fig


# Gradio UI
with gr.Blocks(title="IndoNLP Dashboard") as demo:
    gr.Markdown("# 🇮🇩 IndoNLP Dashboard\nTry Indonesian NLP models, run benchmarks, and visualize metrics.")

    with gr.Tab("Overview"):
        gr.Markdown("## Leaderboard & Comparison")
        metric_choice = gr.Radio(choices=["f1", "accuracy", "precision", "recall"], value="f1", label="Metric")
        leaderboard_plot_el = gr.Plot(value=leaderboard_plot("f1"))
        metric_choice.change(leaderboard_plot, inputs=[metric_choice], outputs=[leaderboard_plot_el])
| with gr.Tab("Try Models"): | |
| gr.Markdown("### Interactive Inference") | |
| task = gr.Dropdown(choices=["sentiment", "ner", "qa", "summarization"], value="sentiment", label="Task") | |
| model = gr.Dropdown(choices=[], label="Model") | |
| input_text = gr.Textbox(lines=6, placeholder="Type Indonesian text here...", label="Input Text") | |
| qa_question = gr.Textbox(lines=2, placeholder="Question (for QA)", visible=False, label="Question") | |
| run_btn = gr.Button("Run") | |
| output = gr.HTML(label="Output") | |

        # Update model choices (and QA question visibility) when the task changes
        def update_models_for_task(t):
            if t == "sentiment":
                keys = list(sentiment_mod.AVAILABLE_MODELS.keys())
                return gr.update(choices=keys, value=keys[0]), gr.update(visible=False)
            if t == "ner":
                keys = list(ner_mod.AVAILABLE_MODELS.keys())
                return gr.update(choices=keys, value=keys[0]), gr.update(visible=False)
            if t == "qa":
                keys = list(qa_mod.AVAILABLE_MODELS.keys())
                return gr.update(choices=keys, value=keys[0]), gr.update(visible=True)
            if t == "summarization":
                keys = list(summ_mod.AVAILABLE_MODELS.keys())
                return gr.update(choices=keys, value=keys[0]), gr.update(visible=False)
            return gr.update(choices=[]), gr.update(visible=False)
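
        # The two gr.update(...) values returned above are mapped, in order, onto the
        # outputs of the change event below: [model, qa_question].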
        task.change(update_models_for_task, inputs=[task], outputs=[model, qa_question])

        def run_model(task, model_key, text, question):
            try:
                if task == "sentiment":
                    res = sentiment_mod.predict(text, model_key)
                    # format as HTML
                    rows = [f"<li>{label}: {score:.4f}</li>" for label, score in res]
                    return f"<b>Sentiment (top scores):</b><ul>{''.join(rows)}</ul>"
                if task == "ner":
                    ents = ner_mod.predict(text, model_key)
                    # render token-highlighted HTML
                    html = ner_mod.render_ner_html(text, ents)
                    return html
                if task == "qa":
                    ans = qa_mod.predict(text, question, model_key)
                    return f"<b>Answer:</b> {ans.get('answer')} <br/><small>score: {ans.get('score'):.4f}</small>"
                if task == "summarization":
                    summ = summ_mod.predict(text, model_key)
                    return f"<b>Summary:</b><p>{summ}</p>"
                return "Unsupported task"
            except Exception as e:
                return f"<pre>Error: {e}\n{traceback.format_exc()}</pre>"

        run_btn.click(run_model, inputs=[task, model, input_text, qa_question], outputs=[output])
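
        # Quick manual sanity check from a Python shell (the model key is hypothetical --
        # use an actual key from sentiment_mod.AVAILABLE_MODELS):
        #   run_model("sentiment", "indobert-base", "Filmnya bagus sekali!", "")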
| with gr.Tab("Benchmark / Evaluate"): | |
| gr.Markdown("Upload CSV (classification): columns `text`,`label`. For NER provide CoNLL-like TSV or JSONL.\nFor QA provide JSONL with `context`,`question`,`answers`.") | |
        file_in = gr.File(label="Upload file")
        bench_task = gr.Dropdown(choices=["sentiment", "ner", "qa"], value="sentiment", label="Task")
        bench_model = gr.Dropdown(choices=[], label="Model")
        run_eval = gr.Button("Evaluate")
        eval_output = gr.JSON()
        cm_plot = gr.Plot()

        def update_models_for_eval(t):
            if t == "sentiment":
                keys = list(sentiment_mod.AVAILABLE_MODELS.keys())
                return gr.update(choices=keys, value=keys[0])
            if t == "ner":
                keys = list(ner_mod.AVAILABLE_MODELS.keys())
                return gr.update(choices=keys, value=keys[0])
            if t == "qa":
                keys = list(qa_mod.AVAILABLE_MODELS.keys())
                return gr.update(choices=keys, value=keys[0])
            return gr.update(choices=[])

        bench_task.change(update_models_for_eval, inputs=[bench_task], outputs=[bench_model])
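
        # evaluate_classification / evaluate_ner / evaluate_qa come from metrics/evaluate.py
        # (not shown). Based on how their results are consumed below, they are assumed to
        # return dicts roughly shaped like:
        #   evaluate_classification -> {"metrics": {"f1": ..., "accuracy": ..., ...},
        #                               "confusion_matrix": [[...]], "labels": [...]}
        #   evaluate_ner            -> {..., "confusion_matrix": [[...]]}  (matrix optional)
        #   evaluate_qa             -> {..., "f1": ...}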

        def run_evaluation(file, task, model_key):
            global leaderboard
            if file is None:
                return {"error": "No file uploaded"}, px.imshow([[0]])
            try:
                if task == "sentiment":
                    # gr.File may hand back a tempfile-like object or a plain path string
                    path = file.name if hasattr(file, "name") else file
                    df = pd.read_csv(path)
                    res = evaluate_classification(df, model_key)
                    # update leaderboard: append the new entry and persist it
                    new_entry = {"task": "Sentiment", "model": model_key, **res["metrics"]}
                    leaderboard = pd.concat([leaderboard, pd.DataFrame([new_entry])], ignore_index=True)
                    save_leaderboard(leaderboard, LEADERBOARD_PATH)
                    # build confusion matrix plot
                    cm = res.get("confusion_matrix")
                    labels = res.get("labels")
                    fig = px.imshow(cm, x=labels, y=labels, text_auto=True, title="Confusion Matrix")
                    return res, fig
                if task == "ner":
                    # accept JSONL of entities or CoNLL TSV; evaluate_ner will parse
                    res = evaluate_ner(file, model_key)
                    return res, px.imshow(res.get("confusion_matrix", [[0]]), text_auto=True)
                if task == "qa":
                    res = evaluate_qa(file, model_key)
                    return res, px.imshow([[res.get("f1", 0)]], title="QA F1")
                return {"error": "Unsupported task"}, px.imshow([[0]])
            except Exception as e:
                return {"error": str(e), "traceback": traceback.format_exc()}, px.imshow([[0]])

        run_eval.click(run_evaluation, inputs=[file_in, bench_task, bench_model], outputs=[eval_output, cm_plot])
| with gr.Tab("About & Debugging"): | |
| gr.Markdown("## Notes & Debugging") | |
| gr.Markdown(""" | |
| - If a model fails to load, check model slug and Space `HF_TOKEN` setting. | |
| - Large models require GPU in Space settings to avoid OOM. | |
| - For private models set `HF_TOKEN` as a secret in Space settings. | |
| """) | |


if __name__ == "__main__":
    demo.launch()
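    # Note: if inference is slow, Gradio's request queue can be enabled before launch,
    # e.g. demo.queue().launch()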