Spaces:

darisdzakwanhoesien
/

indo_nlp

Running

App Files Files Community

darisdzakwanhoesien committed on Oct 17

Commit

30016dc

verified ·

1 Parent(s): 926f6e5

Create app.py

Browse files

Files changed (1) hide show

app.py +144 -0

app.py ADDED Viewed

	@@ -0,0 +1,144 @@

+# app.py
+import gradio as gr
+from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification, AutoModelForSequenceClassification, AutoModelForMaskedLM, AutoModel, AutoModelForQuestionAnswering
+import threading
+# --- configuration: model ids you might want to swap ---
+# Maps each task name (the keys accepted by get_pipeline and offered in the UI
+# dropdown) to the Hugging Face model id that is loaded for it.
+MODELS = {
+    # checkpoint loaded for the fill-mask task
+    "fill-mask": "indolem/indobert-base-uncased",
+    # a sample fine-tuned classifier on IndoNLU (replace with your preferred HF model)
+    "sentiment": "ayameRushia/indobert-base-uncased-finetuned-indonlu-smsa",
+    # example NER model fine-tuned from Indonesian BERT (replace if you prefer another)
+    "ner": "ageng-anugrah/indobert-large-p2-finetuned-ner",
+    # QA (if you have a QA model); you can also use a general model but best to use dedicated fine-tuned QA model
+    # NOTE(review): this is the same id as "embeddings" below, i.e. it does not
+    # look like a QA-fine-tuned checkpoint - confirm the answers are meaningful
+    # or swap in a dedicated extractive-QA model.
+    "qa": "indobenchmark/indobert-base-p1",
+    # embeddings / feature extraction: use a model that supports sentence embeddings or feature extraction
+    "embeddings": "indobenchmark/indobert-base-p1",
+}
+# pipeline cache: task name -> pipeline object, filled lazily by get_pipeline
+PIPELINES = {}
+# held by get_pipeline around both the cache lookup and the model load
+PIPELINE_LOCK = threading.Lock()
+def get_pipeline(task: str):
+    """Lazy-load pipeline for a given task. Thread-safe."""
+    with PIPELINE_LOCK:
+        if task in PIPELINES:
+            return PIPELINES[task]
+        if task == "fill-mask":
+            p = pipeline("fill-mask", model=MODELS["fill-mask"], tokenizer=MODELS["fill-mask"])
+        elif task == "sentiment":
+            p = pipeline("text-classification", model=MODELS["sentiment"], tokenizer=MODELS["sentiment"], return_all_scores=True)
+        elif task == "ner":
+            # aggregation_strategy avoids repeated tokens; set to "simple" or None to see raw results
+            p = pipeline("token-classification", model=MODELS["ner"], tokenizer=MODELS["ner"], aggregation_strategy="simple")
+        elif task == "qa":
+            # For QA we return an extractive QA pipeline
+            p = pipeline("question-answering", model=MODELS["qa"], tokenizer=MODELS["qa"])
+        elif task == "embeddings":
+            # Use feature-extraction pipeline (returns token embeddings; we'll average to produce sentence-level)
+            p = pipeline("feature-extraction", model=MODELS["embeddings"], tokenizer=MODELS["embeddings"])
+        else:
+            raise ValueError(f"Unknown task: {task}")
+        PIPELINES[task] = p
+        return p
+# --- functions for each task ---
+def run_fill_mask(text):
+    p = get_pipeline("fill-mask")
+    # The fill-mask pipeline expects a special mask token like <mask> or [MASK] depending on model/tokenizer.
+    # We'll try both: if the chosen model uses [MASK], user should include it; otherwise replace token.
+    try:
+        outputs = p(text)
+    except Exception as e:
+        return f"Error running fill-mask: {e}"
+    # Format results
+    return "\n".join([f"{o['sequence']} (score: {o['score']:.4f})" for o in outputs])
+def run_sentiment(text):
+    p = get_pipeline("sentiment")
+    try:
+        outputs = p(text)
+    except Exception as e:
+        return f"Error running sentiment: {e}"
+    # outputs is list of dicts with label/score
+    return "\n".join([f"{o['label']}: {o['score']:.4f}" for o in outputs])
+def run_ner(text):
+    p = get_pipeline("ner")
+    try:
+        ents = p(text)
+    except Exception as e:
+        return f"Error running NER: {e}"
+    if not ents:
+        return "No entities found."
+    # Format: label (span): text
+    lines = []
+    for e in ents:
+        label = e.get("entity_group", e.get("entity"))
+        span = e.get("word", "")
+        score = e.get("score", 0.0)
+        lines.append(f"{label} ({score:.3f}): {span}")
+    return "\n".join(lines)
+def run_qa(context, question):
+    p = get_pipeline("qa")
+    try:
+        out = p(question=question, context=context)
+    except Exception as e:
+        return f"Error running QA: {e}"
+    return f"Answer: {out.get('answer')} (score: {out.get('score', 0):.4f})"
+def run_embeddings(text):
+    p = get_pipeline("embeddings")
+    try:
+        feats = p(text)  # returns nested token embeddings
+    except Exception as e:
+        return f"Error extracting embeddings: {e}"
+    # average token embeddings to get sentence vector
+    import numpy as np
+    arr = np.array(feats)  # shape: (1, seq_len, hidden)
+    sent = arr.mean(axis=1)  # (1, hidden)
+    vec = sent[0].tolist()
+    # For display keep a short preview
+    preview = ", ".join([f"{v:.4f}" for v in vec[:8]]) + ("..." if len(vec) > 8 else "")
+    return f"Embedding (dim {len(vec)}): [{preview}]"
+# --- Gradio UI ---
+with gr.Blocks(title="Indonesian NLP Playground (IndoBERT / IndoLEM / IndoNLU)") as demo:
+    gr.Markdown("## Indonesian NLP Playground\nChoose a task, enter Indonesian text, and run IndoBERT / IndoLEM-powered models.\n\nModels are loaded lazily to save memory. You can replace model ids in the `MODELS` dict.")
+    with gr.Row():
+        task = gr.Dropdown(choices=["fill-mask", "sentiment", "ner", "qa", "embeddings"], value="sentiment", label="Task")
+    input_text = gr.Textbox(lines=4, placeholder="Type Indonesian text here...", label="Input Text")
+    # extra inputs for QA
+    qa_question = gr.Textbox(lines=2, placeholder="Question (for QA)", visible=False, label="Question (QA only)")
+    output = gr.Textbox(lines=10, label="Output")
+    def on_task_change(t):
+        qa_question.visible = (t == "qa")
+        return gr.update(visible=(t == "qa"))
+    task.change(on_task_change, inputs=[task], outputs=[qa_question])
+    def run(selected_task, text, question):
+        if selected_task == "fill-mask":
+            return run_fill_mask(text)
+        if selected_task == "sentiment":
+            return run_sentiment(text)
+        if selected_task == "ner":
+            return run_ner(text)
+        if selected_task == "qa":
+            return run_qa(text, question)
+        if selected_task == "embeddings":
+            return run_embeddings(text)
+        return "Unknown task."
+    btn = gr.Button("Run")
+    btn.click(run, inputs=[task, input_text, qa_question], outputs=[output])
+if __name__ == "__main__":
+    demo.launch()