# app.py
# Main Gradio dashboard for IndoNLP Space
import os
import json
import traceback
from typing import Optional
import gradio as gr
import pandas as pd
import plotly.express as px
from models import sentiment as sentiment_mod
from models import ner as ner_mod
from models import qa as qa_mod
from models import summarization as summ_mod
from metrics.evaluate import evaluate_classification, evaluate_ner, evaluate_qa, load_leaderboard, save_leaderboard
# Ensure metrics dir
os.makedirs("metrics", exist_ok=True)
LEADERBOARD_PATH = "metrics/leaderboard.json"
leaderboard = load_leaderboard(LEADERBOARD_PATH)
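# NOTE: this file assumes load_leaderboard() returns a pandas DataFrame and
# save_leaderboard() writes it back to JSON. Based on how the leaderboard is
# used below (plotted by "model"/"task" and extended with evaluation metrics),
# a row is expected to look roughly like the record sketched here; the model
# name is a hypothetical example, not a real entry:
#   {"task": "Sentiment", "model": "indobert-sentiment", "f1": 0.91,
#    "accuracy": 0.92, "precision": 0.90, "recall": 0.91}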
# Utility to create a plot from leaderboard
def leaderboard_plot(metric: str = "f1"):
    if leaderboard.empty:
        return px.bar(title="No leaderboard data")
    if metric not in leaderboard.columns:
        metric = "f1"
    fig = px.bar(
        leaderboard.sort_values(metric, ascending=False),
        x="model", y=metric, color="task", text=metric,
        title=f"Leaderboard by {metric.upper()}",
    )
    return fig
# Gradio UI
with gr.Blocks(title="IndoNLP Dashboard") as demo:
gr.Markdown("# ๐Ÿ‡ฎ๐Ÿ‡ฉ IndoNLP Dashboard\nTry Indonesian NLP models, run benchmarks, and visualize metrics.")
with gr.Tab("Overview"):
gr.Markdown("## Leaderboard & Comparison")
metric_choice = gr.Radio(choices=["f1", "accuracy", "precision", "recall"], value="f1")
leaderboard_plot_el = gr.Plot(value=leaderboard_plot("f1"))
metric_choice.change(lambda m: leaderboard_plot(m), inputs=[metric_choice], outputs=[leaderboard_plot_el])
with gr.Tab("Try Models"):
gr.Markdown("### Interactive Inference")
task = gr.Dropdown(choices=["sentiment", "ner", "qa", "summarization"], value="sentiment", label="Task")
model = gr.Dropdown(choices=[], label="Model")
input_text = gr.Textbox(lines=6, placeholder="Type Indonesian text here...", label="Input Text")
qa_question = gr.Textbox(lines=2, placeholder="Question (for QA)", visible=False, label="Question")
run_btn = gr.Button("Run")
output = gr.HTML(label="Output")
        # Update model choices (and QA question visibility) when the task changes
        def update_models_for_task(t):
            registries = {
                "sentiment": sentiment_mod.AVAILABLE_MODELS,
                "ner": ner_mod.AVAILABLE_MODELS,
                "qa": qa_mod.AVAILABLE_MODELS,
                "summarization": summ_mod.AVAILABLE_MODELS,
            }
            if t not in registries:
                return gr.update(choices=[]), gr.update(visible=False)
            keys = list(registries[t].keys())
            return gr.update(choices=keys, value=keys[0]), gr.update(visible=(t == "qa"))

        task.change(update_models_for_task, inputs=[task], outputs=[model, qa_question])
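        # The AVAILABLE_MODELS registries are assumed (not verified here) to be
        # dicts keyed by a human-readable model name; only the keys are shown in
        # the dropdowns, and predict() resolves a key internally. A hypothetical
        # entry, for illustration only:
        #   AVAILABLE_MODELS = {"IndoBERT sentiment (example)": "<hub-model-id>"}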
        def run_model(task, model_key, text, question):
            try:
                if task == "sentiment":
                    res = sentiment_mod.predict(text, model_key)
                    # format (label, score) pairs as an HTML list
                    rows = [f"<li>{label}: {score:.4f}</li>" for label, score in res]
                    return f"<b>Sentiment (top scores):</b><ul>{''.join(rows)}</ul>"
                if task == "ner":
                    ents = ner_mod.predict(text, model_key)
                    # render token-highlighted HTML
                    return ner_mod.render_ner_html(text, ents)
                if task == "qa":
                    ans = qa_mod.predict(text, question, model_key)
                    score = ans.get("score")
                    score_str = f"{score:.4f}" if isinstance(score, (int, float)) else "n/a"
                    return f"<b>Answer:</b> {ans.get('answer')} <br/><small>score: {score_str}</small>"
                if task == "summarization":
                    summ = summ_mod.predict(text, model_key)
                    return f"<b>Summary:</b><p>{summ}</p>"
                return "Unsupported task"
            except Exception as e:
                return f"<pre>Error: {e}\n{traceback.format_exc()}</pre>"

        run_btn.click(run_model, inputs=[task, model, input_text, qa_question], outputs=[output])
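        # Contracts assumed by run_model() above (inferred from usage, not
        # verified against the model modules):
        #   sentiment_mod.predict(text, key)  -> iterable of (label, score) pairs
        #   ner_mod.predict(text, key)        -> entities accepted by render_ner_html()
        #   qa_mod.predict(context, q, key)   -> dict with "answer" and optionally "score"
        #   summ_mod.predict(text, key)       -> summary string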
with gr.Tab("Benchmark / Evaluate"):
gr.Markdown("Upload CSV (classification): columns `text`,`label`. For NER provide CoNLL-like TSV or JSONL.\nFor QA provide JSONL with `context`,`question`,`answers`.")
file_in = gr.File(label="Upload file")
bench_task = gr.Dropdown(choices=["sentiment", "ner", "qa"], value="sentiment")
bench_model = gr.Dropdown(choices=[], label="Model")
run_eval = gr.Button("Evaluate")
eval_output = gr.JSON()
cm_plot = gr.Plot()
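        # Illustrative upload formats matching the description above; values are
        # made up and the exact `answers` shape depends on metrics/evaluate.py:
        #   sentiment CSV : text,label        e.g.  "Filmnya bagus sekali",positive
        #   NER           : CoNLL-like TSV (token<TAB>tag per line) or JSONL
        #   QA JSONL      : {"context": "...", "question": "...", "answers": ...}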
        # Update model choices when the benchmark task changes
        def update_models_for_eval(t):
            registries = {
                "sentiment": sentiment_mod.AVAILABLE_MODELS,
                "ner": ner_mod.AVAILABLE_MODELS,
                "qa": qa_mod.AVAILABLE_MODELS,
            }
            if t not in registries:
                return gr.update(choices=[])
            keys = list(registries[t].keys())
            return gr.update(choices=keys, value=keys[0])

        bench_task.change(update_models_for_eval, inputs=[bench_task], outputs=[bench_model])
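        # Assumed (not verified here) return shapes from metrics.evaluate, based
        # on how the results are consumed in run_evaluation() below:
        #   evaluate_classification -> {"metrics": {...}, "confusion_matrix": [[...]], "labels": [...]}
        #   evaluate_ner            -> dict, optionally with a "confusion_matrix"
        #   evaluate_qa             -> dict with at least an "f1" score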
        def run_evaluation(file, task, model_key):
            global leaderboard
            if file is None:
                return {"error": "No file uploaded"}, px.imshow([[0]])
            try:
                if task == "sentiment":
                    # gr.File may return a tempfile-like object or a plain path string
                    path = file.name if hasattr(file, "name") else file
                    df = pd.read_csv(path)
                    res = evaluate_classification(df, model_key)
                    # append the new run to the leaderboard and persist it
                    new_entry = {"task": "Sentiment", "model": model_key, **res["metrics"]}
                    leaderboard = pd.concat([leaderboard, pd.DataFrame([new_entry])], ignore_index=True)
                    save_leaderboard(leaderboard, LEADERBOARD_PATH)
                    # build confusion matrix plot
                    cm = res.get("confusion_matrix")
                    labels = res.get("labels")
                    fig = px.imshow(cm, x=labels, y=labels, text_auto=True, title="Confusion Matrix")
                    return res, fig
                if task == "ner":
                    # accept JSONL of entities or CoNLL-like TSV; evaluate_ner will parse
                    res = evaluate_ner(file, model_key)
                    return res, px.imshow(res.get("confusion_matrix", [[0]]), text_auto=True)
                if task == "qa":
                    res = evaluate_qa(file, model_key)
                    return res, px.imshow([[res.get("f1", 0)]], title="QA F1")
                return {"error": "Unsupported task"}, px.imshow([[0]])
            except Exception as e:
                return {"error": str(e), "traceback": traceback.format_exc()}, px.imshow([[0]])

        run_eval.click(run_evaluation, inputs=[file_in, bench_task, bench_model], outputs=[eval_output, cm_plot])
with gr.Tab("About & Debugging"):
gr.Markdown("## Notes & Debugging")
gr.Markdown("""
- If a model fails to load, check model slug and Space `HF_TOKEN` setting.
- Large models require GPU in Space settings to avoid OOM.
- For private models set `HF_TOKEN` as a secret in Space settings.
""")
if __name__ == "__main__":
    demo.launch()