# app.py
# Main Gradio dashboard for IndoNLP Space
import os
import json
import traceback
from typing import Optional
import gradio as gr
import pandas as pd
import plotly.express as px
from models import sentiment as sentiment_mod
from models import ner as ner_mod
from models import qa as qa_mod
from models import summarization as summ_mod
from metrics.evaluate import evaluate_classification, evaluate_ner, evaluate_qa, load_leaderboard, save_leaderboard
# Ensure metrics dir
os.makedirs("metrics", exist_ok=True)
LEADERBOARD_PATH = "metrics/leaderboard.json"
leaderboard = load_leaderboard(LEADERBOARD_PATH)
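# NOTE: this file assumes load_leaderboard() returns a pandas DataFrame and
# save_leaderboard() writes it back to JSON. Based on how the leaderboard is
# used below (plotted by "model"/"task" and extended with evaluation metrics),
# a row is expected to look roughly like the record sketched here; the model
# name is a hypothetical example, not a real entry:
#   {"task": "Sentiment", "model": "indobert-sentiment", "f1": 0.91,
#    "accuracy": 0.92, "precision": 0.90, "recall": 0.91}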
# Utility to create a plot from leaderboard
def leaderboard_plot(metric: str = "f1"):
    if leaderboard.empty:
        return px.bar(title="No leaderboard data")
    if metric not in leaderboard.columns:
        metric = "f1"
    fig = px.bar(
        leaderboard.sort_values(metric, ascending=False),
        x="model", y=metric, color="task", text=metric,
        title=f"Leaderboard by {metric.upper()}",
    )
    return fig
# Gradio UI
with gr.Blocks(title="IndoNLP Dashboard") as demo:
gr.Markdown("# ๐Ÿ‡ฎ๐Ÿ‡ฉ IndoNLP Dashboard\nTry Indonesian NLP models, run benchmarks, and visualize metrics.")
with gr.Tab("Overview"):
gr.Markdown("## Leaderboard & Comparison")
metric_choice = gr.Radio(choices=["f1", "accuracy", "precision", "recall"], value="f1")
leaderboard_plot_el = gr.Plot(value=leaderboard_plot("f1"))
metric_choice.change(lambda m: leaderboard_plot(m), inputs=[metric_choice], outputs=[leaderboard_plot_el])
with gr.Tab("Try Models"):
gr.Markdown("### Interactive Inference")
task = gr.Dropdown(choices=["sentiment", "ner", "qa", "summarization"], value="sentiment", label="Task")
model = gr.Dropdown(choices=[], label="Model")
input_text = gr.Textbox(lines=6, placeholder="Type Indonesian text here...", label="Input Text")
qa_question = gr.Textbox(lines=2, placeholder="Question (for QA)", visible=False, label="Question")
run_btn = gr.Button("Run")
output = gr.HTML(label="Output")
        # Update model choices (and QA question visibility) when the task changes
        def update_models_for_task(t):
            registries = {
                "sentiment": sentiment_mod.AVAILABLE_MODELS,
                "ner": ner_mod.AVAILABLE_MODELS,
                "qa": qa_mod.AVAILABLE_MODELS,
                "summarization": summ_mod.AVAILABLE_MODELS,
            }
            if t not in registries:
                return gr.update(choices=[]), gr.update(visible=False)
            keys = list(registries[t].keys())
            return gr.update(choices=keys, value=keys[0]), gr.update(visible=(t == "qa"))

        task.change(update_models_for_task, inputs=[task], outputs=[model, qa_question])
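        # The AVAILABLE_MODELS registries are assumed (not verified here) to be
        # dicts keyed by a human-readable model name; only the keys are shown in
        # the dropdowns, and predict() resolves a key internally. A hypothetical
        # entry, for illustration only:
        #   AVAILABLE_MODELS = {"IndoBERT sentiment (example)": "<hub-model-id>"}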
        def run_model(task, model_key, text, question):
            try:
                if task == "sentiment":
                    res = sentiment_mod.predict(text, model_key)
                    # format (label, score) pairs as an HTML list
                    rows = [f"<li>{label}: {score:.4f}</li>" for label, score in res]
                    return f"<b>Sentiment (top scores):</b><ul>{''.join(rows)}</ul>"
                if task == "ner":
                    ents = ner_mod.predict(text, model_key)
                    # render token-highlighted HTML
                    return ner_mod.render_ner_html(text, ents)
                if task == "qa":
                    ans = qa_mod.predict(text, question, model_key)
                    score = ans.get("score")
                    score_str = f"{score:.4f}" if isinstance(score, (int, float)) else "n/a"
                    return f"<b>Answer:</b> {ans.get('answer')} <br/><small>score: {score_str}</small>"
                if task == "summarization":
                    summ = summ_mod.predict(text, model_key)
                    return f"<b>Summary:</b><p>{summ}</p>"
                return "Unsupported task"
            except Exception as e:
                return f"<pre>Error: {e}\n{traceback.format_exc()}</pre>"

        run_btn.click(run_model, inputs=[task, model, input_text, qa_question], outputs=[output])
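        # Contracts assumed by run_model() above (inferred from usage, not
        # verified against the model modules):
        #   sentiment_mod.predict(text, key)  -> iterable of (label, score) pairs
        #   ner_mod.predict(text, key)        -> entities accepted by render_ner_html()
        #   qa_mod.predict(context, q, key)   -> dict with "answer" and optionally "score"
        #   summ_mod.predict(text, key)       -> summary string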
with gr.Tab("Benchmark / Evaluate"):
gr.Markdown("Upload CSV (classification): columns `text`,`label`. For NER provide CoNLL-like TSV or JSONL.\nFor QA provide JSONL with `context`,`question`,`answers`.")
file_in = gr.File(label="Upload file")
bench_task = gr.Dropdown(choices=["sentiment", "ner", "qa"], value="sentiment")
bench_model = gr.Dropdown(choices=[], label="Model")
run_eval = gr.Button("Evaluate")
eval_output = gr.JSON()
cm_plot = gr.Plot()
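        # Illustrative upload formats matching the description above; values are
        # made up and the exact `answers` shape depends on metrics/evaluate.py:
        #   sentiment CSV : text,label        e.g.  "Filmnya bagus sekali",positive
        #   NER           : CoNLL-like TSV (token<TAB>tag per line) or JSONL
        #   QA JSONL      : {"context": "...", "question": "...", "answers": ...}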
        # Update model choices when the benchmark task changes
        def update_models_for_eval(t):
            registries = {
                "sentiment": sentiment_mod.AVAILABLE_MODELS,
                "ner": ner_mod.AVAILABLE_MODELS,
                "qa": qa_mod.AVAILABLE_MODELS,
            }
            if t not in registries:
                return gr.update(choices=[])
            keys = list(registries[t].keys())
            return gr.update(choices=keys, value=keys[0])

        bench_task.change(update_models_for_eval, inputs=[bench_task], outputs=[bench_model])
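        # Assumed (not verified here) return shapes from metrics.evaluate, based
        # on how the results are consumed in run_evaluation() below:
        #   evaluate_classification -> {"metrics": {...}, "confusion_matrix": [[...]], "labels": [...]}
        #   evaluate_ner            -> dict, optionally with a "confusion_matrix"
        #   evaluate_qa             -> dict with at least an "f1" score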
        def run_evaluation(file, task, model_key):
            global leaderboard
            if file is None:
                return {"error": "No file uploaded"}, px.imshow([[0]])
            try:
                if task == "sentiment":
                    # gr.File may return a tempfile-like object or a plain path string
                    path = file.name if hasattr(file, "name") else file
                    df = pd.read_csv(path)
                    res = evaluate_classification(df, model_key)
                    # append the new run to the leaderboard and persist it
                    new_entry = {"task": "Sentiment", "model": model_key, **res["metrics"]}
                    leaderboard = pd.concat([leaderboard, pd.DataFrame([new_entry])], ignore_index=True)
                    save_leaderboard(leaderboard, LEADERBOARD_PATH)
                    # build confusion matrix plot
                    cm = res.get("confusion_matrix")
                    labels = res.get("labels")
                    fig = px.imshow(cm, x=labels, y=labels, text_auto=True, title="Confusion Matrix")
                    return res, fig
                if task == "ner":
                    # accept JSONL of entities or CoNLL-like TSV; evaluate_ner will parse
                    res = evaluate_ner(file, model_key)
                    return res, px.imshow(res.get("confusion_matrix", [[0]]), text_auto=True)
                if task == "qa":
                    res = evaluate_qa(file, model_key)
                    return res, px.imshow([[res.get("f1", 0)]], title="QA F1")
                return {"error": "Unsupported task"}, px.imshow([[0]])
            except Exception as e:
                return {"error": str(e), "traceback": traceback.format_exc()}, px.imshow([[0]])

        run_eval.click(run_evaluation, inputs=[file_in, bench_task, bench_model], outputs=[eval_output, cm_plot])
with gr.Tab("About & Debugging"):
gr.Markdown("## Notes & Debugging")
gr.Markdown("""
- If a model fails to load, check model slug and Space `HF_TOKEN` setting.
- Large models require GPU in Space settings to avoid OOM.
- For private models set `HF_TOKEN` as a secret in Space settings.
""")
if __name__ == "__main__":
    demo.launch()