app.py
CHANGED
@@ -185,7 +185,6 @@ def preprocess_audio_ffmpeg(audio_data: bytes, target_sr: int = 16000) -> np.nda
         raise HTTPException(status_code=400, detail="Audio preprocessing failed. Ensure ffmpeg is installed.")
 
 def _validate_and_normalize_audio(audio_array: np.ndarray) -> np.ndarray:
-    # Check for silence or very low amplitude
     rms = np.sqrt(np.mean(audio_array**2))
     if rms < 0.001:
         logger.warning("Audio appears to be very quiet or silent")
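The deleted comment described the RMS silence check that follows it. For reference, a minimal standalone sketch of that check, using the 0.001 threshold from the diff (the test signals below are illustrative, not from app.py):

    import numpy as np

    def is_near_silent(audio_array: np.ndarray, threshold: float = 0.001) -> bool:
        # Root-mean-square amplitude: near zero for silent or very quiet audio.
        rms = np.sqrt(np.mean(audio_array**2))
        return rms < threshold

    silence = np.zeros(16000, dtype=np.float32)                      # 1 s of silence at 16 kHz
    tone = 0.1 * np.sin(2 * np.pi * 440 * np.arange(16000) / 16000)  # 440 Hz test tone
    print(is_near_silent(silence))  # True
    print(is_near_silent(tone))     # False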
@@ -560,6 +559,43 @@ async def status():
         "message": "Using lightweight keyword-based language detection - no heavy models required"
     }
 
+@app.get("/languages")
+async def get_languages():
+    return {
+        "supported_languages": {
+            "yo": {
+                "name": "Yoruba",
+                "code": "yo",
+                "tts_available": True,
+                "asr_model": "NCAIR1/Yoruba-ASR"
+            },
+            "ha": {
+                "name": "Hausa",
+                "code": "ha",
+                "tts_available": True,
+                "asr_model": "NCAIR1/Hausa-ASR"
+            },
+            "ig": {
+                "name": "Igbo",
+                "code": "ig",
+                "tts_available": False,
+                "asr_model": "NCAIR1/Igbo-ASR",
+                "note": "Text response only - TTS not available"
+            },
+            "en": {
+                "name": "English",
+                "code": "en",
+                "tts_available": True,
+                "asr_model": "NCAIR1/NigerianAccentedEnglish"
+            }
+        },
+        "usage": {
+            "specify_language": "Use /speak endpoint with language parameter for best accuracy",
+            "auto_detection": "Use /speak-auto endpoint when language is unknown",
+            "igbo_response": "Igbo responses return text only (no audio TTS available)"
+        }
+    }
+
 @app.post("/chat")
 async def chat(text: str = Form(...), speak: bool = False, raw: bool = False):
     if not text.strip():
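The new /languages endpoint is a plain GET returning JSON, so clients can discover TTS availability before choosing an endpoint. A minimal client sketch, assuming the app is served locally on port 7860 (the base URL is a placeholder, not from the diff):

    import requests

    BASE = "http://localhost:7860"  # placeholder; substitute the actual Space URL

    resp = requests.get(f"{BASE}/languages")
    resp.raise_for_status()
    info = resp.json()

    # Report which languages can return synthesized audio.
    for code, lang in info["supported_languages"].items():
        mode = "audio" if lang["tts_available"] else "text only"
        print(f'{code}: {lang["name"]} ({mode})')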
@@ -587,11 +623,10 @@ async def speak_to_ai(audio_file: UploadFile = File(...), language: str = Form(.
     ai_response = get_ai_response(transcription)
 
     if speak:
-
-
-        if lang == "ig":
-            return FileResponse(output_path, media_type="text/plain", filename="response.txt")
+        if language == "ig":
+            return {"transcription": transcription, "ai_response": ai_response, "language": language, "response_type": "text", "note": "Igbo TTS not available - returning text response"}
         else:
+            output_path = text_to_speech_file(ai_response)
             return FileResponse(output_path, media_type="audio/wav", filename="response.wav")
 
     return {"transcription": transcription, "ai_response": ai_response, "language": language}
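With this change /speak can return either JSON (Igbo, no TTS) or a WAV file, so a caller should branch on the response Content-Type. A hedged client sketch; the audio_file and language form fields match the endpoint signature above, but the speak field and the file names are assumptions:

    import requests

    BASE = "http://localhost:7860"  # placeholder base URL

    with open("question.wav", "rb") as f:
        resp = requests.post(
            f"{BASE}/speak",
            files={"audio_file": f},
            data={"language": "ig", "speak": "true"},  # 'speak' field assumed from the handler body
        )
    resp.raise_for_status()

    # Igbo requests come back as JSON; other languages stream audio/wav.
    if resp.headers.get("content-type", "").startswith("application/json"):
        print(resp.json()["ai_response"])
    else:
        with open("response.wav", "wb") as out:
            out.write(resp.content)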
@@ -604,11 +639,11 @@ async def speak_to_ai_auto(audio_file: UploadFile = File(...), speak: bool = Tru
     transcription = speech_to_text(audio_data)
     ai_response = get_ai_response(transcription)
     if speak:
-
-
-
-            return FileResponse(output_path, media_type="text/plain", filename="response.txt")
+        detected_lang = detect_language(ai_response)
+        if detected_lang == "ig":
+            return {"transcription": transcription, "ai_response": ai_response, "detected_language": detected_lang, "response_type": "text", "note": "Igbo TTS not available - returning text response"}
         else:
+            output_path = text_to_speech_file(ai_response)
             return FileResponse(output_path, media_type="audio/wav", filename="response.wav")
     return {"transcription": transcription, "ai_response": ai_response}
 
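The auto endpoint now routes on detect_language, which the status message describes as lightweight and keyword-based. Its implementation is not part of this diff; purely as an illustration of that approach, a keyword counter might look like this (the marker words are examples, not the app's actual lists):

    # Illustrative only: app.py's real detect_language is not shown in this diff.
    KEYWORDS = {
        "yo": {"bawo", "jowo", "pele"},     # example Yoruba markers
        "ha": {"sannu", "nagode", "yaya"},  # example Hausa markers
        "ig": {"kedu", "biko", "daalu"},    # example Igbo markers
    }

    def detect_language(text: str, default: str = "en") -> str:
        words = set(text.lower().split())
        scores = {lang: len(words & markers) for lang, markers in KEYWORDS.items()}
        best = max(scores, key=scores.get)
        return best if scores[best] > 0 else default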