darisdzakwanhoesien committed on
Commit
30016dc
·
verified ·
1 Parent(s): 926f6e5

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +144 -0
app.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ import gradio as gr
3
+ from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification, AutoModelForSequenceClassification, AutoModelForMaskedLM, AutoModel, AutoModelForQuestionAnswering
4
+ import threading
5
+
6
# --- configuration: model ids you might want to swap ---
MODELS = {
    # Masked-LM checkpoint backing the fill-mask demo.
    "fill-mask": "indolem/indobert-base-uncased",
    # Sentiment classifier fine-tuned on IndoNLU SmSA; swap in your preferred HF model.
    "sentiment": "ayameRushia/indobert-base-uncased-finetuned-indonlu-smsa",
    # Indonesian NER model fine-tuned from IndoBERT-large; replace if you prefer another.
    "ner": "ageng-anugrah/indobert-large-p2-finetuned-ner",
    # Extractive QA; a dedicated QA-fine-tuned checkpoint will give better answers.
    "qa": "indobenchmark/indobert-base-p1",
    # Feature extraction / sentence embeddings; pick a model that supports it.
    "embeddings": "indobenchmark/indobert-base-p1",
}
18
+
19
# Lazily-populated cache mapping task name -> loaded HF pipeline.
PIPELINES = {}
# Serializes cache access so concurrent requests never load the same model twice.
PIPELINE_LOCK = threading.Lock()
22
+
23
def get_pipeline(task: str):
    """Return the cached Hugging Face pipeline for *task*, loading it lazily.

    Thread-safe: the lock covers both the cache lookup and the model load,
    so concurrent Gradio requests never load the same model twice.

    Args:
        task: one of the keys of ``MODELS``.

    Raises:
        ValueError: if *task* is not a known task name.
    """
    with PIPELINE_LOCK:
        if task in PIPELINES:
            return PIPELINES[task]

        model_id = MODELS.get(task)
        if model_id is None:
            raise ValueError(f"Unknown task: {task}")

        if task == "fill-mask":
            p = pipeline("fill-mask", model=model_id, tokenizer=model_id)
        elif task == "sentiment":
            # top_k=None returns scores for every label; it replaces the
            # deprecated return_all_scores=True flag and yields a flat list
            # of {label, score} dicts for a single input.
            p = pipeline("text-classification", model=model_id, tokenizer=model_id, top_k=None)
        elif task == "ner":
            # aggregation_strategy="simple" merges word-piece tokens into
            # whole-entity spans; set to None to see raw per-token results.
            p = pipeline("token-classification", model=model_id, tokenizer=model_id, aggregation_strategy="simple")
        elif task == "qa":
            # Extractive QA. NOTE(review): MODELS["qa"] is a base (not
            # QA-fine-tuned) checkpoint; answers will be weak until a
            # dedicated QA model is substituted.
            p = pipeline("question-answering", model=model_id, tokenizer=model_id)
        else:  # task == "embeddings"
            # feature-extraction returns per-token embeddings; callers
            # mean-pool them into a sentence vector.
            p = pipeline("feature-extraction", model=model_id, tokenizer=model_id)

        PIPELINES[task] = p
        return p
47
+
48
+ # --- functions for each task ---
49
+
50
def run_fill_mask(text):
    """Fill the mask token(s) in *text*; return top predictions, one per line.

    The mask token depends on the model's tokenizer (e.g. [MASK] for
    BERT-style models such as indolem/indobert-base-uncased); the user must
    include it in the input.
    """
    p = get_pipeline("fill-mask")
    try:
        outputs = p(text)
    except Exception as e:
        return f"Error running fill-mask: {e}"
    # With several mask tokens the pipeline returns one result list per mask
    # (a list of lists); flatten so the formatting below works in both cases
    # instead of raising TypeError on the inner lists.
    if outputs and isinstance(outputs[0], list):
        outputs = [o for group in outputs for o in group]
    return "\n".join(f"{o['sequence']} (score: {o['score']:.4f})" for o in outputs)
60
+
61
def run_sentiment(text):
    """Classify the sentiment of *text*; return one 'LABEL: score' line per label."""
    p = get_pipeline("sentiment")
    try:
        outputs = p(text)
    except Exception as e:
        return f"Error running sentiment: {e}"
    # When all label scores are requested (legacy return_all_scores=True),
    # a single-string input comes back as a list of lists of dicts — unwrap
    # the extra nesting so the formatting loop always sees dicts.
    if outputs and isinstance(outputs[0], list):
        outputs = outputs[0]
    return "\n".join(f"{o['label']}: {o['score']:.4f}" for o in outputs)
69
+
70
def run_ner(text):
    """Run named-entity recognition on *text*; format one entity per line as
    'LABEL (score): span'."""
    ner_pipe = get_pipeline("ner")
    try:
        entities = ner_pipe(text)
    except Exception as exc:
        return f"Error running NER: {exc}"
    if not entities:
        return "No entities found."
    # entity_group is present when aggregation is enabled; fall back to the
    # raw per-token entity label otherwise.
    formatted = [
        f"{ent.get('entity_group', ent.get('entity'))} ({ent.get('score', 0.0):.3f}): {ent.get('word', '')}"
        for ent in entities
    ]
    return "\n".join(formatted)
86
+
87
def run_qa(context, question):
    """Answer *question* extractively from *context*; return a formatted string."""
    qa_pipe = get_pipeline("qa")
    try:
        result = qa_pipe(question=question, context=context)
    except Exception as exc:
        return f"Error running QA: {exc}"
    answer = result.get("answer")
    score = result.get("score", 0)
    return f"Answer: {answer} (score: {score:.4f})"
94
+
95
def run_embeddings(text):
    """Embed *text* by mean-pooling token features; return a short vector preview."""
    import numpy as np

    embed_pipe = get_pipeline("embeddings")
    try:
        # feature-extraction output is nested: (1, seq_len, hidden)
        token_feats = embed_pipe(text)
    except Exception as exc:
        return f"Error extracting embeddings: {exc}"
    # Mean over the sequence axis -> (1, hidden); take the single sentence row.
    sentence_vec = np.array(token_feats).mean(axis=1)[0].tolist()
    # Show only the first few components to keep the output readable.
    head = ", ".join(f"{v:.4f}" for v in sentence_vec[:8])
    tail = "..." if len(sentence_vec) > 8 else ""
    return f"Embedding (dim {len(sentence_vec)}): [{head}{tail}]"
109
+
110
# --- Gradio UI ---

with gr.Blocks(title="Indonesian NLP Playground (IndoBERT / IndoLEM / IndoNLU)") as demo:
    gr.Markdown("## Indonesian NLP Playground\nChoose a task, enter Indonesian text, and run IndoBERT / IndoLEM-powered models.\n\nModels are loaded lazily to save memory. You can replace model ids in the `MODELS` dict.")
    with gr.Row():
        task = gr.Dropdown(choices=["fill-mask", "sentiment", "ner", "qa", "embeddings"], value="sentiment", label="Task")
        input_text = gr.Textbox(lines=4, placeholder="Type Indonesian text here...", label="Input Text")
    # Extra input shown only when the QA task is selected.
    qa_question = gr.Textbox(lines=2, placeholder="Question (for QA)", visible=False, label="Question (QA only)")
    output = gr.Textbox(lines=10, label="Output")

    def on_task_change(t):
        """Toggle the question box for the QA task.

        Visibility must be changed via the returned gr.update(); mutating the
        component's .visible attribute after render has no effect, so the
        previous direct assignment was removed.
        """
        return gr.update(visible=(t == "qa"))

    task.change(on_task_change, inputs=[task], outputs=[qa_question])

    def run(selected_task, text, question):
        """Dispatch the selected task to its runner and return display text."""
        if selected_task == "fill-mask":
            return run_fill_mask(text)
        if selected_task == "sentiment":
            return run_sentiment(text)
        if selected_task == "ner":
            return run_ner(text)
        if selected_task == "qa":
            return run_qa(text, question)
        if selected_task == "embeddings":
            return run_embeddings(text)
        return "Unknown task."

    btn = gr.Button("Run")
    btn.click(run, inputs=[task, input_text, qa_question], outputs=[output])

if __name__ == "__main__":
    demo.launch()