# Finetuned-NLLB / app.py
# ayymen's picture
# Update app.py (#1)
# d55ce43 verified
import gradio as gr
from transformers import NllbTokenizer, AutoModelForSeq2SeqLM
import torch
# Checkpoint identifier handed to `from_pretrained` for both the model and
# the tokenizer.
MODEL_NAME = "Tamazight-NLP/NLLB-200-600M-Tamazight-All-Data-3-epoch"

# Human-readable language label (shown in the UI dropdowns) -> language code
# token understood by the tokenizer. Insertion order is the dropdown order.
#
# NOTE(review): the taq_* and kab_* codes are paired with the
# Tachelhit/Central Atlas and Tarifit labels here; presumably the fine-tuned
# checkpoint reuses those codes for these varieties -- confirm against the
# model card before changing them.
NLLB_LANG_MAPPING = {
    "English": "eng_Latn",
    # Tamazight varieties (Tifinagh and Latin scripts).
    "Standard Moroccan Tamazight": "tzm_Tfng",
    "Tachelhit/Central Atlas Tamazight": "taq_Tfng",
    "Tachelhit/Central Atlas Tamazight (Latin)": "taq_Latn",
    "Tarifit": "kab_Tfng",
    "Tarifit (Latin)": "kab_Latn",
    # Arabic varieties.
    "Moroccan Darija": "ary_Arab",
    "Modern Standard Arabic": "arb_Arab",
    # Other supported languages.
    "Catalan": "cat_Latn",
    "Spanish": "spa_Latn",
    "French": "fra_Latn",
    "German": "deu_Latn",
    "Dutch": "nld_Latn",
    "Russian": "rus_Cyrl",
    "Italian": "ita_Latn",
    "Turkish": "tur_Latn",
    "Esperanto": "epo_Latn",
}
# Run on CUDA when torch reports it as available, otherwise fall back to CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the seq2seq checkpoint and move its weights to the selected device.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
model = model.to(device)

# The tokenizer is loaded from the same checkpoint as the model.
tokenizer = NllbTokenizer.from_pretrained(MODEL_NAME)
def translate(text, source_lang="English", target_lang="Tachelhit/Central Atlas Tamazight",
              max_length=238, num_beams=4, repetition_penalty=1.0):
    """
    Translate multi-line text while preserving line breaks.

    The input is split on "\\n" and each non-blank line is translated
    independently; blank (empty or whitespace-only) lines are emitted as
    empty strings so the output has the same number of lines as the input.

    Args:
        text: Text to translate. ``None`` is treated as an empty string.
        source_lang: Key of ``NLLB_LANG_MAPPING`` naming the input language.
        target_lang: Key of ``NLLB_LANG_MAPPING`` naming the output language.
        max_length: Forwarded to ``model.generate`` as ``max_length``
            (coerced to ``int``).
        num_beams: Forwarded to ``model.generate`` as ``num_beams``
            (coerced to ``int``).
        repetition_penalty: Forwarded to ``model.generate`` as
            ``repetition_penalty`` (coerced to ``float``).

    Returns:
        The translated lines joined with "\\n".

    Raises:
        KeyError: If ``source_lang`` or ``target_lang`` is not a key of
            ``NLLB_LANG_MAPPING`` and the text has at least one non-blank line.
    """
    lines = (text or "").split("\n")

    # Nothing to translate: keep the line structure and skip the model
    # entirely (no tokenizer state is touched, no language lookup is done).
    if not any(line.strip() for line in lines):
        return "\n" * (len(lines) - 1)

    # Loop-invariant setup, done once instead of once per line.
    # Note: this mutates the shared module-level tokenizer.
    tokenizer.src_lang = NLLB_LANG_MAPPING[source_lang]
    generation_kwargs = {
        "forced_bos_token_id": tokenizer.convert_tokens_to_ids(NLLB_LANG_MAPPING[target_lang]),
        # UI sliders may deliver numeric values as floats; normalize them the
        # same way repetition_penalty is normalized.
        "max_length": int(max_length),
        "num_beams": int(num_beams),
        "repetition_penalty": float(repetition_penalty),
    }

    translations = []
    for line in lines:
        if not line.strip():
            translations.append("")  # preserve empty lines
            continue
        inputs = tokenizer(line, return_tensors="pt").to(model.device)
        translated_tokens = model.generate(**inputs, **generation_kwargs)
        # A single input sequence was passed, so take the first decoded result.
        translations.append(
            tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
        )
    return "\n".join(translations)
# Gradio front end for `translate`. The input components are listed in the
# same order as translate's parameters: text, source language, target
# language, max length, number of beams, repetition penalty.
gradio_ui = gr.Interface(
    title="NLLB Tamazight Translation Demo",
    fn=translate,
    inputs=[
        gr.components.Textbox(
            label="Text",
            lines=4,
            placeholder="ⵙⵙⴽⵛⵎ ⴰⴹⵕⵉⵚ...\nEnter text to translate...",
        ),
        # Both dropdowns offer every label defined in NLLB_LANG_MAPPING.
        gr.components.Dropdown(
            label="Source Language",
            choices=list(NLLB_LANG_MAPPING),
            value="English",
        ),
        gr.components.Dropdown(
            label="Target Language",
            choices=list(NLLB_LANG_MAPPING),
            value="Standard Moroccan Tamazight",
        ),
        gr.components.Slider(
            8,
            400,
            value=238,
            step=8,
            label="Max Length (in tokens). Increase in case the output looks truncated.",
        ),
        gr.components.Slider(
            1,
            25,
            value=4,
            step=1,
            label="Number of beams. Higher values might improve translation accuracy at the cost of speed.",
        ),
        gr.components.Slider(
            1,
            10,
            value=1.0,
            step=0.1,
            label="Repetition penalty.",
        ),
    ],
    outputs=gr.components.Textbox(label="Translated text", lines=4),
)

# Start the web app as soon as the script is executed.
gradio_ui.launch()