nexusbert committed on
Commit
634bf12
·
1 Parent(s): 22c3401
Files changed (1) hide show
  1. app.py +44 -9
app.py CHANGED
@@ -185,7 +185,6 @@ def preprocess_audio_ffmpeg(audio_data: bytes, target_sr: int = 16000) -> np.nda
185
  raise HTTPException(status_code=400, detail="Audio preprocessing failed. Ensure ffmpeg is installed.")
186
 
187
  def _validate_and_normalize_audio(audio_array: np.ndarray) -> np.ndarray:
188
- # Check for silence or very low amplitude
189
  rms = np.sqrt(np.mean(audio_array**2))
190
  if rms < 0.001:
191
  logger.warning("Audio appears to be very quiet or silent")
@@ -560,6 +559,43 @@ async def status():
560
  "message": "Using lightweight keyword-based language detection - no heavy models required"
561
  }
562
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
563
  @app.post("/chat")
564
  async def chat(text: str = Form(...), speak: bool = False, raw: bool = False):
565
  if not text.strip():
@@ -587,11 +623,10 @@ async def speak_to_ai(audio_file: UploadFile = File(...), language: str = Form(.
587
  ai_response = get_ai_response(transcription)
588
 
589
  if speak:
590
- output_path = text_to_speech_file(ai_response)
591
- lang = detect_language(ai_response)
592
- if lang == "ig":
593
- return FileResponse(output_path, media_type="text/plain", filename="response.txt")
594
  else:
 
595
  return FileResponse(output_path, media_type="audio/wav", filename="response.wav")
596
 
597
  return {"transcription": transcription, "ai_response": ai_response, "language": language}
@@ -604,11 +639,11 @@ async def speak_to_ai_auto(audio_file: UploadFile = File(...), speak: bool = Tru
604
  transcription = speech_to_text(audio_data)
605
  ai_response = get_ai_response(transcription)
606
  if speak:
607
- output_path = text_to_speech_file(ai_response)
608
- lang = detect_language(ai_response)
609
- if lang == "ig":
610
- return FileResponse(output_path, media_type="text/plain", filename="response.txt")
611
  else:
 
612
  return FileResponse(output_path, media_type="audio/wav", filename="response.wav")
613
  return {"transcription": transcription, "ai_response": ai_response}
614
 
 
185
  raise HTTPException(status_code=400, detail="Audio preprocessing failed. Ensure ffmpeg is installed.")
186
 
187
  def _validate_and_normalize_audio(audio_array: np.ndarray) -> np.ndarray:
 
188
  rms = np.sqrt(np.mean(audio_array**2))
189
  if rms < 0.001:
190
  logger.warning("Audio appears to be very quiet or silent")
 
559
  "message": "Using lightweight keyword-based language detection - no heavy models required"
560
  }
561
 
562
@app.get("/languages")
async def get_languages():
    """Describe the languages this service supports.

    Returns a JSON payload with one entry per language (display name, ISO
    code, whether TTS audio is available, and the ASR model used), plus
    short usage notes for the /speak and /speak-auto endpoints.
    """
    # Per-language capability table. Igbo carries an extra note because it
    # has no TTS model, so callers receive text-only responses for it.
    languages = {
        "yo": {
            "name": "Yoruba",
            "code": "yo",
            "tts_available": True,
            "asr_model": "NCAIR1/Yoruba-ASR",
        },
        "ha": {
            "name": "Hausa",
            "code": "ha",
            "tts_available": True,
            "asr_model": "NCAIR1/Hausa-ASR",
        },
        "ig": {
            "name": "Igbo",
            "code": "ig",
            "tts_available": False,
            "asr_model": "NCAIR1/Igbo-ASR",
            "note": "Text response only - TTS not available",
        },
        "en": {
            "name": "English",
            "code": "en",
            "tts_available": True,
            "asr_model": "NCAIR1/NigerianAccentedEnglish",
        },
    }
    # Guidance for clients choosing between the fixed-language and
    # auto-detect endpoints.
    usage_notes = {
        "specify_language": "Use /speak endpoint with language parameter for best accuracy",
        "auto_detection": "Use /speak-auto endpoint when language is unknown",
        "igbo_response": "Igbo responses return text only (no audio TTS available)",
    }
    return {"supported_languages": languages, "usage": usage_notes}
599
  @app.post("/chat")
600
  async def chat(text: str = Form(...), speak: bool = False, raw: bool = False):
601
  if not text.strip():
 
623
  ai_response = get_ai_response(transcription)
624
 
625
  if speak:
626
+ if language == "ig":
627
+ return {"transcription": transcription, "ai_response": ai_response, "language": language, "response_type": "text", "note": "Igbo TTS not available - returning text response"}
 
 
628
  else:
629
+ output_path = text_to_speech_file(ai_response)
630
  return FileResponse(output_path, media_type="audio/wav", filename="response.wav")
631
 
632
  return {"transcription": transcription, "ai_response": ai_response, "language": language}
 
639
  transcription = speech_to_text(audio_data)
640
  ai_response = get_ai_response(transcription)
641
  if speak:
642
+ detected_lang = detect_language(ai_response)
643
+ if detected_lang == "ig":
644
+ return {"transcription": transcription, "ai_response": ai_response, "detected_language": detected_lang, "response_type": "text", "note": "Igbo TTS not available - returning text response"}
 
645
  else:
646
+ output_path = text_to_speech_file(ai_response)
647
  return FileResponse(output_path, media_type="audio/wav", filename="response.wav")
648
  return {"transcription": transcription, "ai_response": ai_response}
649