nexusbert committed
Commit 22c3401 · Parent(s): 35e5ec8
Files changed (1): app.py (+133 -27)

app.py CHANGED
@@ -241,6 +241,50 @@ def chunk_audio(audio_array: np.ndarray, chunk_length: float = 10.0, overlap: fl
     logger.info(f"Split audio into {len(chunks)} chunks of {chunk_length}s each with quality preservation")
     return chunks
 
+def speech_to_text_with_language(audio_data: bytes, language: str) -> str:
+    audio_array = preprocess_audio_ffmpeg(audio_data)
+
+    audio_duration = len(audio_array) / 16000
+    logger.info(f"Audio duration: {audio_duration:.2f} seconds, processing with {language} model")
+
+    model, proc = _get_asr(language)
+    if model is None or proc is None:
+        logger.error(f"Failed to load {language} ASR model")
+        return ""
+
+    if audio_duration <= 15:
+        return _process_single_chunk_with_language(audio_array, model, proc, language)
+    else:
+        return _process_chunked_audio_with_language(audio_array, model, proc, language)
+
+def _process_single_chunk_with_language(audio_array: np.ndarray, model, proc, language: str) -> str:
+    text = _run_whisper(model, proc, audio_array)
+    if text and text.strip():
+        logger.info(f"Transcription ({language}): {text[:100]}...")
+        return text.strip()
+    return ""
+
+def _process_chunked_audio_with_language(audio_array: np.ndarray, model, proc, language: str) -> str:
+    chunks = chunk_audio(audio_array, chunk_length=10.0, overlap=1.0)
+
+    chunk_texts = []
+    for i, chunk in enumerate(chunks):
+        try:
+            text = _run_whisper(model, proc, chunk)
+            if text and text.strip():
+                chunk_texts.append(text.strip())
+                logger.info(f"Chunk {i+1}/{len(chunks)} ({language}): {text[:50]}...")
+        except Exception as e:
+            logger.warning(f"Failed to process chunk {i+1} with {language}: {e}")
+            continue
+
+    if chunk_texts:
+        combined_text = " ".join(chunk_texts)
+        logger.info(f"Combined {language} result: {combined_text[:100]}...")
+        return combined_text
+
+    return ""
+
 def speech_to_text(audio_data: bytes) -> str:
     audio_array = preprocess_audio_ffmpeg(audio_data)
 
@@ -254,22 +298,20 @@ def speech_to_text(audio_data: bytes) -> str:
 
 def _process_single_chunk(audio_array: np.ndarray) -> str:
     candidates = []
+
     for code in ["yo", "ha", "ig", "en"]:
         model, proc = _get_asr(code)
         if model is None or proc is None:
            continue
         text = _run_whisper(model, proc, audio_array)
-        if text:
-            candidates.append((code, text))
-
-    for lang_code, text in candidates:
-        det = detect_language(text)
-        if lang_code == det:
-            return text
-
-    if candidates:
-        return max((t for _, t in candidates), key=lambda s: len(s or ""))
-    return ""
+        if text and text.strip():
+            candidates.append((code, text.strip()))
+
+    if not candidates:
+        return ""
+
+    best_transcription = _select_best_transcription(candidates)
+    return best_transcription
 
 def _process_chunked_audio(audio_array: np.ndarray) -> str:
     chunks = chunk_audio(audio_array, chunk_length=10.0, overlap=1.0)
@@ -297,23 +339,65 @@ def _process_chunked_audio(audio_array: np.ndarray) -> str:
         language_results[code] = combined_text
         logger.info(f"Combined {code} result: {combined_text[:100]}...")
 
-    best_result = ""
-    best_confidence = 0
-
-    for lang_code, text in language_results.items():
-        detected_lang = detect_language(text)
-        confidence = len(text.split())
-
-        logger.info(f"Language {lang_code}: detected as {detected_lang}, confidence: {confidence}")
-
-        if detected_lang == lang_code:
-            confidence *= 2
-
-        if confidence > best_confidence:
-            best_confidence = confidence
-            best_result = text
-
-    return best_result if best_result else ""
+    if not language_results:
+        return ""
+
+    best_transcription = _select_best_transcription(list(language_results.items()))
+    return best_transcription
+
+def _select_best_transcription(candidates: list) -> str:
+    if not candidates:
+        return ""
+
+    if len(candidates) == 1:
+        return candidates[0][1]
+
+    scored_candidates = []
+
+    for lang_code, text in candidates:
+        score = _score_transcription_quality(text, lang_code)
+        scored_candidates.append((score, lang_code, text))
+        logger.info(f"Transcription quality score for {lang_code}: {score:.2f} - '{text[:50]}...'")
+
+    scored_candidates.sort(key=lambda x: x[0], reverse=True)
+    best_score, best_lang, best_text = scored_candidates[0]
+
+    logger.info(f"Selected best transcription: {best_lang} (score: {best_score:.2f}) - '{best_text[:100]}...'")
+    return best_text
+
+def _score_transcription_quality(text: str, lang_code: str) -> float:
+    if not text or not text.strip():
+        return 0.0
+
+    text = text.strip()
+    score = 0.0
+
+    word_count = len(text.split())
+    if word_count == 0:
+        return 0.0
+
+    score += min(word_count * 0.5, 10.0)
+
+    char_count = len(text)
+    score += min(char_count * 0.1, 5.0)
+
+    if lang_code == "yo":
+        yoruba_chars = sum(1 for c in text if c in "ẹọṣgb")
+        score += yoruba_chars * 2.0
+    elif lang_code == "ig":
+        igbo_chars = sum(1 for c in text if c in "ụọị")
+        score += igbo_chars * 2.0
+    elif lang_code == "ha":
+        hausa_chars = sum(1 for c in text if c in "ts")
+        score += hausa_chars * 1.5
+
+    punctuation_penalty = text.count('?') + text.count('!') + text.count('.')
+    score -= punctuation_penalty * 0.5
+
+    repeated_chars = sum(1 for i in range(len(text)-2) if text[i] == text[i+1] == text[i+2])
+    score -= repeated_chars * 1.0
+
+    return max(score, 0.0)
 
 
 def get_ai_response(text: str) -> str:
@@ -491,7 +575,29 @@ async def chat(text: str = Form(...), speak: bool = False, raw: bool = False):
     return {"question": text, "answer": final_text}
 
 @app.post("/speak")
-async def speak_to_ai(audio_file: UploadFile = File(...), speak: bool = True):
+async def speak_to_ai(audio_file: UploadFile = File(...), language: str = Form(...), speak: bool = True):
+    if not audio_file.content_type.startswith('audio/'):
+        raise HTTPException(status_code=400, detail="File must be an audio file")
+
+    if language not in ["yo", "ha", "ig", "en"]:
+        raise HTTPException(status_code=400, detail="Language must be one of: yo (Yoruba), ha (Hausa), ig (Igbo), en (English)")
+
+    audio_data = await audio_file.read()
+    transcription = speech_to_text_with_language(audio_data, language)
+    ai_response = get_ai_response(transcription)
+
+    if speak:
+        output_path = text_to_speech_file(ai_response)
+        lang = detect_language(ai_response)
+        if lang == "ig":
+            return FileResponse(output_path, media_type="text/plain", filename="response.txt")
+        else:
+            return FileResponse(output_path, media_type="audio/wav", filename="response.wav")
+
+    return {"transcription": transcription, "ai_response": ai_response, "language": language}
+
+@app.post("/speak-auto")
+async def speak_to_ai_auto(audio_file: UploadFile = File(...), speak: bool = True):
     if not audio_file.content_type.startswith('audio/'):
         raise HTTPException(status_code=400, detail="File must be an audio file")
     audio_data = await audio_file.read()
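Usage note: the new forced-language path bypasses the four-model candidate loop entirely. A minimal local sketch of how it might be driven, assuming app.py imports cleanly as a module and a sample recording exists (the module import and filename are placeholder assumptions, not part of this commit):

# Minimal sketch of the forced-language path; "app" and the .wav
# filename are assumptions, not part of this commit.
from app import speech_to_text_with_language

with open("sample_yoruba.wav", "rb") as f:
    audio_bytes = f.read()

# Pins the Yoruba ASR model; no candidate generation or scoring runs.
# Recordings longer than 15 s fall through to the chunked variant
# (10 s chunks, 1 s overlap) automatically.
text = speech_to_text_with_language(audio_bytes, "yo")
print(text or "transcription failed or model unavailable")

Callers never choose between the single-chunk and chunked helpers; the duration check inside speech_to_text_with_language does it for them.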
 
 
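For the auto-detect path, selection no longer depends on detect_language() agreement: _select_best_transcription() ranks candidates with _score_transcription_quality(), which rewards word and character counts plus language-specific characters, and penalizes heavy punctuation and repeated-character runs. A toy illustration of the ranking, assuming app.py is importable (the candidate strings are invented):

from app import _select_best_transcription  # private helper, imported only for illustration

# Two hypothetical transcriptions of the same Yoruba utterance.
candidates = [
    ("en", "bawo ni o se wa"),   # English model output: no language-character bonus
    ("yo", "báwo ni o ṣe wà"),   # Yoruba model output: 'b' and 'ṣ' are in the bonus set "ẹọṣgb"
]

# Both score 2.5 for five words and 1.5 for fifteen characters, but the
# Yoruba candidate adds 2 * 2.0 for its bonus characters: 8.0 vs 4.0.
print(_select_best_transcription(candidates))  # -> "báwo ni o ṣe wà"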
 
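On the client side, /speak now requires a language form field and rejects anything outside yo/ha/ig/en with a 400, while the previous auto-detect behavior moves to the new /speak-auto route. A hedged sketch with the requests library; the base URL and filename are placeholders:

import requests

BASE = "http://localhost:7860"  # placeholder; wherever app.py is served

with open("question.wav", "rb") as f:  # placeholder recording
    r = requests.post(
        f"{BASE}/speak",
        files={"audio_file": ("question.wav", f, "audio/wav")},
        data={"language": "yo"},    # required form field: yo, ha, ig, or en
        params={"speak": "false"},  # no Form() wrapper, so FastAPI reads it from the query string
    )
print(r.json())  # {"transcription": ..., "ai_response": ..., "language": "yo"}

with open("question.wav", "rb") as f:
    # /speak-auto keeps the old behavior: all four models run, best candidate wins.
    r = requests.post(f"{BASE}/speak-auto",
                      files={"audio_file": ("question.wav", f, "audio/wav")})

With speak left at its default of True, the endpoint instead streams back a FileResponse: WAV audio, or plain text when the response is detected as Igbo.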