nexusbert committed
Commit c162be1 · Parent: f28c3d1

Files changed (2):
  1. Dockerfile +1 -1
  2. app.py +63 -117
Dockerfile CHANGED
@@ -46,7 +46,7 @@ RUN python -c "from transformers import pipeline; pipeline('text-to-speech', mod
     && python -c "from transformers import pipeline; pipeline('text-to-speech', model='facebook/mms-tts-eng')" \
     && python -c "from transformers import pipeline; pipeline('text-to-speech', model='facebook/mms-tts-yor')"

-# NCAIR1 models will be downloaded at runtime when HF_TOKEN is available
+# Models will be downloaded at runtime when HF_TOKEN is available

 # Copy project files
 COPY . .
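For context on the updated comment: the Dockerfile pre-downloads the public MMS TTS pipelines at build time, while anything that needs authentication is deferred to runtime, once the running Space has its HF_TOKEN secret. A rough sketch of that runtime pattern follows (illustrative only, not part of this commit; recent transformers releases accept token=, older ones used use_auth_token=):

```python
# Sketch: fetch a Hub model at runtime, authenticating with the HF_TOKEN secret.
import os
from transformers import pipeline

hf_token = os.getenv("HF_TOKEN")  # injected as a Space secret at runtime

# token= authenticates Hub downloads; it is needed for gated/private repos
# (e.g. the NCAIR1 ASR models referenced in app.py) and harmless for public ones.
tts_en = pipeline("text-to-speech", model="facebook/mms-tts-eng", token=hf_token)
```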
app.py CHANGED
@@ -39,7 +39,6 @@ app.add_middleware(

 ASK_URL = "https://remostart-milestone-one-farmlingua-ai.hf.space/ask"
 tts_ha, tts_en, tts_yo, tts_ig = None, None, None, None
-natlas_tokenizer, natlas_model = None, None

 asr_models = {
     "ha": {"repo": "NCAIR1/Hausa-ASR", "model": None, "proc": None},
@@ -49,7 +48,7 @@ asr_models = {
 }

 def load_models():
-    global tts_ha, tts_en, tts_yo, tts_ig, natlas_tokenizer, natlas_model
+    global tts_ha, tts_en, tts_yo, tts_ig
     device = 0 if torch.cuda.is_available() else -1
     hf_token = os.getenv("HF_TOKEN")
     if hf_token:
@@ -60,8 +59,7 @@ def load_models():
     else:
         logger.info("HF_TOKEN is set and ready for authenticated model access.")

-    logger.info("Loading N-ATLaS language identification model...")
-    _load_natlas()
+    logger.info("Using lightweight keyword-based language detection (no heavy models)")

     logger.info("Loading TTS models...")
     try:
@@ -185,130 +183,78 @@ def get_ai_response(text: str) -> str:
         logger.error(f"AI request error: {e}")
         return f"I'm sorry, I couldn't connect to the AI service. You said: '{text}'."

+# Enhanced keyword lists for language detection
 HAUSA_WORDS = [
-    "aikin","manoma","gona","amfanin","yanayi","tsaba","fasaha","bisa","noman","shuka",
-    "daji","rani","damina","amfani","bidi'a","noma","bashi","manure","tsiro","gishiri"
+    # Agricultural terms
+    "aikin", "manoma", "gona", "amfanin", "yanayi", "tsaba", "fasaha", "bisa", "noman", "shuka",
+    "daji", "rani", "damina", "amfani", "bidi'a", "noma", "bashi", "manure", "tsiro", "gishiri",
+    # Common Hausa words
+    "da", "shi", "ta", "su", "mu", "ku", "ni", "kai", "ita",
+    "ina", "yana", "tana", "suna", "muna", "kuna", "na", "ka", "sa",
+    "wani", "wata", "wasu", "wadansu", "wadannan", "wannan", "wancan",
+    "kamar", "kusa", "nisa", "gaba", "baya", "hagu", "dama", "sama", "kasa",
+    "lokaci"
 ]

 YORUBA_WORDS = [
-    "ilé","ọmọ","òun","awọn","agbẹ","oko","ọgbà","irugbin","àkọsílẹ","omi","ojo","àgbàlá","irọlẹ"
+    # Agricultural terms
+    "ilé", "ọmọ", "òun", "awọn", "agbẹ", "oko", "ọgbà", "irugbin", "àkọsílẹ", "omi", "ojo", "àgbàlá", "irọlẹ",
+    # Common Yoruba words
+    "ni", "ti", "si", "fun", "lati", "ninu", "lori", "labe", "pelu", "ati", "tabi", "sugbon",
+    "o", "a", "e", "won", "mi", "re", "wa", "yin",
+    "kan", "kankan", "die", "pupo", "gbogbo", "kookan",
+    "nibi", "nibe",
+    "igba", "akoko", "osu", "odun", "ise", "owo"
 ]

 IGBO_WORDS = [
-    "ugbo","akụkọ","mmiri","ala","ọrụ","ncheta","ọhụrụ","ugwu","nri","ahụhụ"
+    # Agricultural terms
+    "ugbo", "akụkọ", "mmiri", "ala", "ọrụ", "ncheta", "ọhụrụ", "ugwu", "nri", "ahụhụ",
+    # Common Igbo words
+    "na", "n'", "maka", "n'ihi", "n'ime", "n'elu", "n'okpuru",
+    "m", "i", "o", "ya", "anyị", "unu", "ha",
+    "otu", "ọtụtụ",
+    "ebe",
+    "oge"
 ]

-def _load_natlas():
-    global natlas_tokenizer, natlas_model
-    if natlas_tokenizer is not None and natlas_model is not None:
-        logger.info("N-ATLaS model already loaded")
-        return True
-
-    hf_token = os.getenv("HF_TOKEN")
-    if hf_token:
-        hf_token = hf_token.strip()
-
-    if not hf_token:
-        logger.error("HF_TOKEN not available for N-ATLaS model access")
-        return False
-
-    try:
-        logger.info("Loading N-ATLaS language identification model...")
-        logger.info("Downloading model files from Hugging Face...")
-
-        natlas_tokenizer = AutoTokenizer.from_pretrained("NCAIR1/N-ATLaS", token=hf_token)
-        natlas_model = AutoModelForCausalLM.from_pretrained(
-            "NCAIR1/N-ATLaS",
-            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-            device_map="auto" if torch.cuda.is_available() else None,
-            token=hf_token,
-            trust_remote_code=True,
-            low_cpu_mem_usage=True,
-            use_cache=True
-        )
-        logger.info("Successfully loaded N-ATLaS language identification model")
-        return True
-    except Exception as e:
-        logger.exception(f"Failed to load N-ATLaS model: {e}")
-        natlas_tokenizer, natlas_model = None, None
-        return False
+def detect_language_keywords(text: str) -> str:
+    """
+    Lightweight keyword-based language detection.
+    Returns language code: 'ha' (Hausa), 'yo' (Yoruba), 'ig' (Igbo), 'en' (English)
+    """
+    text_lower = text.lower().strip()
+
+    if not text_lower:
+        return "en"  # Default to English for empty text
+
+    # Count matches for each language
+    hausa_count = sum(1 for word in HAUSA_WORDS if word in text_lower)
+    yoruba_count = sum(1 for word in YORUBA_WORDS if word in text_lower)
+    igbo_count = sum(1 for word in IGBO_WORDS if word in text_lower)
+
+    logger.info(f"Language detection scores - Hausa: {hausa_count}, Yoruba: {yoruba_count}, Igbo: {igbo_count}")
+
+    # Return language with highest count, default to English if no matches
+    if hausa_count > yoruba_count and hausa_count > igbo_count:
+        logger.info("Keyword detection: Hausa")
+        return "ha"
+    elif yoruba_count > igbo_count:
+        logger.info("Keyword detection: Yoruba")
+        return "yo"
+    elif igbo_count > 0:
+        logger.info("Keyword detection: Igbo")
+        return "ig"
+    else:
+        logger.info("Keyword detection: English (default)")
+        return "en"

 def detect_language(text: str) -> str:
+    """
+    Main language detection function using the lightweight keyword-based approach.
+    """
     logger.info(f"Detecting language for text: '{text[:50]}...'")
-
-    if not _load_natlas():
-        logger.warning("N-ATLaS model not available, falling back to keyword detection")
-        text_lower = text.lower()
-        if any(word in text_lower for word in HAUSA_WORDS):
-            logger.info("Keyword detection: Hausa")
-            return "ha"
-        elif any(word in text_lower for word in YORUBA_WORDS):
-            logger.info("Keyword detection: Yoruba")
-            return "yo"
-        elif any(word in text_lower for word in IGBO_WORDS):
-            logger.info("Keyword detection: Igbo")
-            return "ig"
-        else:
-            logger.info("Keyword detection: English (default)")
-            return "en"
-
-    try:
-        logger.info("Using N-ATLaS for language detection")
-        messages = [
-            {'role': 'system', 'content': 'You are a language identification assistant. Identify the language of the given text and respond with only the language code: "en" for English, "ha" for Hausa, "yo" for Yoruba, or "ig" for Igbo.'},
-            {'role': 'user', 'content': f'What language is this text written in? "{text}"'}
-        ]
-
-        formatted_text = natlas_tokenizer.apply_chat_template(
-            messages,
-            add_generation_prompt=True,
-            tokenize=False
-        )
-
-        input_tokens = natlas_tokenizer(formatted_text, return_tensors='pt', add_special_tokens=False)
-        if torch.cuda.is_available():
-            input_tokens = input_tokens.to('cuda')
-
-        with torch.no_grad():
-            outputs = natlas_model.generate(
-                **input_tokens,
-                max_new_tokens=10,
-                use_cache=True,
-                repetition_penalty=1.1,
-                temperature=0.1,
-                do_sample=False
-            )
-
-        response = natlas_tokenizer.batch_decode(outputs)[0]
-        response_text = response.split(messages[1]['content'])[-1].strip().lower()
-
-        logger.info(f"N-ATLaS response: '{response_text}'")
-
-        if 'ha' in response_text:
-            logger.info("N-ATLaS detection: Hausa")
-            return "ha"
-        elif 'yo' in response_text:
-            logger.info("N-ATLaS detection: Yoruba")
-            return "yo"
-        elif 'ig' in response_text:
-            logger.info("N-ATLaS detection: Igbo")
-            return "ig"
-        else:
-            logger.info("N-ATLaS detection: English (default)")
-            return "en"
-
-    except Exception as e:
-        logger.exception(f"Language detection failed: {e}")
-        logger.warning("Falling back to keyword detection due to N-ATLaS error")
-        text_lower = text.lower()
-        if any(word in text_lower for word in HAUSA_WORDS):
-            return "ha"
-        elif any(word in text_lower for word in YORUBA_WORDS):
-            return "yo"
-        elif any(word in text_lower for word in IGBO_WORDS):
-            return "ig"
-        else:
-            return "en"
+    return detect_language_keywords(text)

 def text_to_speech_file(text: str) -> str:
     lang = detect_language(text)
@@ -365,10 +311,9 @@ async def root():

 @app.get("/health")
 async def health():
-    natlas_status = "loaded" if natlas_tokenizer is not None and natlas_model is not None else "not_loaded"
     return {
         "message": "Farmlingua AI Speech Interface is running!",
-        "natlas_status": natlas_status,
+        "language_detection": "keyword-based (lightweight)",
         "tts_models": {
             "hausa": tts_ha is not None,
             "english": tts_en is not None,
@@ -380,8 +325,9 @@ async def health():
 @app.get("/status")
 async def status():
     return {
-        "natlas_loaded": natlas_tokenizer is not None and natlas_model is not None,
-        "loading_message": "N-ATLaS model is loading shards, please wait..." if natlas_tokenizer is None else "N-ATLaS model is ready"
+        "language_detection": "keyword-based (lightweight)",
+        "status": "ready",
+        "message": "Using lightweight keyword-based language detection - no heavy models required"
     }

 @app.post("/chat")
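As a quick sanity check on the detection logic introduced in this commit, the snippet below mirrors the committed scoring approach in a self-contained form. The shortened word lists and sample sentences are illustrative assumptions, not the lists from app.py. Matching relies on Python's substring operator `in`, exactly as in the committed code, so very short keywords can also match inside unrelated English words.

```python
# Minimal, self-contained sketch of the keyword-scoring idea from app.py.
# The word lists here are abbreviated samples (assumption), not the full lists in the diff.
HAUSA_WORDS = ["noma", "gona", "manoma", "shuka"]
YORUBA_WORDS = ["agbẹ", "oko", "irugbin", "omi"]
IGBO_WORDS = ["ugbo", "mmiri", "ala", "nri"]

def detect_language_keywords(text: str) -> str:
    """Return 'ha', 'yo', 'ig', or 'en' based on simple keyword counts."""
    text_lower = text.lower().strip()
    if not text_lower:
        return "en"
    # NOTE: `word in text_lower` is substring matching, mirroring the committed code;
    # short keywords can therefore match inside unrelated words.
    hausa_count = sum(1 for w in HAUSA_WORDS if w in text_lower)
    yoruba_count = sum(1 for w in YORUBA_WORDS if w in text_lower)
    igbo_count = sum(1 for w in IGBO_WORDS if w in text_lower)
    if hausa_count > yoruba_count and hausa_count > igbo_count:
        return "ha"
    elif yoruba_count > igbo_count:
        return "yo"
    elif igbo_count > 0:
        return "ig"
    else:
        return "en"

if __name__ == "__main__":
    print(detect_language_keywords("Manoma suna shuka a gona"))          # "ha"
    print(detect_language_keywords("Agbẹ n gbin irugbin ninu oko"))      # "yo"
    print(detect_language_keywords("How do I improve my maize yield?"))  # "en"
```

With these abbreviated lists the script prints "ha", "yo", and "en" for the three samples. With the full lists in app.py, single-letter entries such as "o", "a", "e", "m", and "i" will also match most English sentences, so a word-boundary match (for example against a token set) may be worth considering if detection skews away from English.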