Commit: finish

app.py (CHANGED)

@@ -241,6 +241,50 @@ def chunk_audio(audio_array: np.ndarray, chunk_length: float = 10.0, overlap: fl
     logger.info(f"Split audio into {len(chunks)} chunks of {chunk_length}s each with quality preservation")
     return chunks
 
 def speech_to_text(audio_data: bytes) -> str:
     audio_array = preprocess_audio_ffmpeg(audio_data)
 
@@ -254,22 +298,20 @@ def speech_to_text(audio_data: bytes) -> str:
 
 def _process_single_chunk(audio_array: np.ndarray) -> str:
     candidates = []
     for code in ["yo", "ha", "ig", "en"]:
         model, proc = _get_asr(code)
         if model is None or proc is None:
             continue
         text = _run_whisper(model, proc, audio_array)
-        if text:
-            candidates.append((code, text))
-
-
-
-
-
-
-    if candidates:
-        return max((t for _, t in candidates), key=lambda s: len(s or ""))
-    return ""
 
 def _process_chunked_audio(audio_array: np.ndarray) -> str:
     chunks = chunk_audio(audio_array, chunk_length=10.0, overlap=1.0)
@@ -297,23 +339,65 @@ def _process_chunked_audio(audio_array: np.ndarray) -> str:
             language_results[code] = combined_text
             logger.info(f"Combined {code} result: {combined_text[:100]}...")
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
 
-
 
 
 def get_ai_response(text: str) -> str:
@@ -491,7 +575,29 @@ async def chat(text: str = Form(...), speak: bool = False, raw: bool = False):
     return {"question": text, "answer": final_text}
 
 @app.post("/speak")
-async def speak_to_ai(audio_file: UploadFile = File(...), speak: bool = True):
     if not audio_file.content_type.startswith('audio/'):
         raise HTTPException(status_code=400, detail="File must be an audio file")
     audio_data = await audio_file.read()

@@ -241,6 +241,50 @@ def chunk_audio(audio_array: np.ndarray, chunk_length: float = 10.0, overlap: fl
     logger.info(f"Split audio into {len(chunks)} chunks of {chunk_length}s each with quality preservation")
     return chunks
 
+def speech_to_text_with_language(audio_data: bytes, language: str) -> str:
+    audio_array = preprocess_audio_ffmpeg(audio_data)
+
+    audio_duration = len(audio_array) / 16000
+    logger.info(f"Audio duration: {audio_duration:.2f} seconds, processing with {language} model")
+
+    model, proc = _get_asr(language)
+    if model is None or proc is None:
+        logger.error(f"Failed to load {language} ASR model")
+        return ""
+
+    if audio_duration <= 15:
+        return _process_single_chunk_with_language(audio_array, model, proc, language)
+    else:
+        return _process_chunked_audio_with_language(audio_array, model, proc, language)
+
+def _process_single_chunk_with_language(audio_array: np.ndarray, model, proc, language: str) -> str:
+    text = _run_whisper(model, proc, audio_array)
+    if text and text.strip():
+        logger.info(f"Transcription ({language}): {text[:100]}...")
+        return text.strip()
+    return ""
+
+def _process_chunked_audio_with_language(audio_array: np.ndarray, model, proc, language: str) -> str:
+    chunks = chunk_audio(audio_array, chunk_length=10.0, overlap=1.0)
+
+    chunk_texts = []
+    for i, chunk in enumerate(chunks):
+        try:
+            text = _run_whisper(model, proc, chunk)
+            if text and text.strip():
+                chunk_texts.append(text.strip())
+                logger.info(f"Chunk {i+1}/{len(chunks)} ({language}): {text[:50]}...")
+        except Exception as e:
+            logger.warning(f"Failed to process chunk {i+1} with {language}: {e}")
+            continue
+
+    if chunk_texts:
+        combined_text = " ".join(chunk_texts)
+        logger.info(f"Combined {language} result: {combined_text[:100]}...")
+        return combined_text
+
+    return ""
+
 def speech_to_text(audio_data: bytes) -> str:
     audio_array = preprocess_audio_ffmpeg(audio_data)
 
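The block above adds a language-pinned transcription path: speech_to_text_with_language loads only the requested ASR model, transcribes short clips in one pass, and falls back to overlapping 10-second chunks for anything longer than 15 seconds. A minimal sketch of calling it directly; the module name "app" and the file "sample_yoruba.wav" are assumptions for the example, and the helpers (preprocess_audio_ffmpeg, _get_asr, _run_whisper) are assumed to be defined elsewhere in app.py:

# Illustrative only; "app" and "sample_yoruba.wav" are assumed names, not part of the commit.
from app import speech_to_text_with_language

with open("sample_yoruba.wav", "rb") as f:
    audio_bytes = f.read()

# Pin the Yoruba model instead of letting the app compare all four languages.
text = speech_to_text_with_language(audio_bytes, "yo")
print(text or "<no transcription>")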

@@ -254,22 +298,20 @@ def speech_to_text(audio_data: bytes) -> str:
 
 def _process_single_chunk(audio_array: np.ndarray) -> str:
     candidates = []
+
     for code in ["yo", "ha", "ig", "en"]:
         model, proc = _get_asr(code)
         if model is None or proc is None:
             continue
         text = _run_whisper(model, proc, audio_array)
+        if text and text.strip():
+            candidates.append((code, text.strip()))
+
+    if not candidates:
+        return ""
+
+    best_transcription = _select_best_transcription(candidates)
+    return best_transcription
 
 def _process_chunked_audio(audio_array: np.ndarray) -> str:
     chunks = chunk_audio(audio_array, chunk_length=10.0, overlap=1.0)

@@ -297,23 +339,65 @@ def _process_chunked_audio(audio_array: np.ndarray) -> str:
             language_results[code] = combined_text
             logger.info(f"Combined {code} result: {combined_text[:100]}...")
 
+    if not language_results:
+        return ""
+
+    best_transcription = _select_best_transcription(list(language_results.items()))
+    return best_transcription
+
+def _select_best_transcription(candidates: list) -> str:
+    if not candidates:
+        return ""
+
+    if len(candidates) == 1:
+        return candidates[0][1]
+
+    scored_candidates = []
+
+    for lang_code, text in candidates:
+        score = _score_transcription_quality(text, lang_code)
+        scored_candidates.append((score, lang_code, text))
+        logger.info(f"Transcription quality score for {lang_code}: {score:.2f} - '{text[:50]}...'")
+
+    scored_candidates.sort(key=lambda x: x[0], reverse=True)
+    best_score, best_lang, best_text = scored_candidates[0]
+
+    logger.info(f"Selected best transcription: {best_lang} (score: {best_score:.2f}) - '{best_text[:100]}...'")
+    return best_text
+
+def _score_transcription_quality(text: str, lang_code: str) -> float:
+    if not text or not text.strip():
+        return 0.0
 
+    text = text.strip()
+    score = 0.0
+
+    word_count = len(text.split())
+    if word_count == 0:
+        return 0.0
+
+    score += min(word_count * 0.5, 10.0)
+
+    char_count = len(text)
+    score += min(char_count * 0.1, 5.0)
+
+    if lang_code == "yo":
+        yoruba_chars = sum(1 for c in text if c in "ẹọṣgb")
+        score += yoruba_chars * 2.0
+    elif lang_code == "ig":
+        igbo_chars = sum(1 for c in text if c in "ụọị")
+        score += igbo_chars * 2.0
+    elif lang_code == "ha":
+        hausa_chars = sum(1 for c in text if c in "ts")
+        score += hausa_chars * 1.5
+
+    punctuation_penalty = text.count('?') + text.count('!') + text.count('.')
+    score -= punctuation_penalty * 0.5
+
+    repeated_chars = sum(1 for i in range(len(text)-2) if text[i] == text[i+1] == text[i+2])
+    score -= repeated_chars * 1.0
+
+    return max(score, 0.0)
 
 
 def get_ai_response(text: str) -> str:
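The scoring helpers above replace the earlier pick-the-longest-candidate rule: each candidate earns points for word and character count plus a bonus for language-specific characters, and loses points for heavy punctuation and character repetition. A small illustrative run, with invented candidate strings (not taken from the app):

# Illustrative only; the candidate texts are made up for the example.
candidates = [
    ("en", "bawo ni"),                    # short, no language-specific bonus
    ("yo", "báwo ni ọjà ṣe wà lónìí"),    # longer, and ọ/ṣ earn the Yoruba bonus
]

for code, text in candidates:
    print(code, round(_score_transcription_quality(text, code), 2))

print(_select_best_transcription(candidates))  # prints the Yoruba candidate here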

@@ -491,7 +575,29 @@ async def chat(text: str = Form(...), speak: bool = False, raw: bool = False):
     return {"question": text, "answer": final_text}
 
 @app.post("/speak")
+async def speak_to_ai(audio_file: UploadFile = File(...), language: str = Form(...), speak: bool = True):
+    if not audio_file.content_type.startswith('audio/'):
+        raise HTTPException(status_code=400, detail="File must be an audio file")
+
+    if language not in ["yo", "ha", "ig", "en"]:
+        raise HTTPException(status_code=400, detail="Language must be one of: yo (Yoruba), ha (Hausa), ig (Igbo), en (English)")
+
+    audio_data = await audio_file.read()
+    transcription = speech_to_text_with_language(audio_data, language)
+    ai_response = get_ai_response(transcription)
+
+    if speak:
+        output_path = text_to_speech_file(ai_response)
+        lang = detect_language(ai_response)
+        if lang == "ig":
+            return FileResponse(output_path, media_type="text/plain", filename="response.txt")
+        else:
+            return FileResponse(output_path, media_type="audio/wav", filename="response.wav")
+
+    return {"transcription": transcription, "ai_response": ai_response, "language": language}
+
+@app.post("/speak-auto")
+async def speak_to_ai_auto(audio_file: UploadFile = File(...), speak: bool = True):
     if not audio_file.content_type.startswith('audio/'):
         raise HTTPException(status_code=400, detail="File must be an audio file")
     audio_data = await audio_file.read()
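With these changes, /speak requires an explicit language form field and delegates to speech_to_text_with_language, while /speak-auto appears to keep the old auto-detection behaviour of trying all four models. A rough client-side sketch using requests; the base URL and file name are placeholders, and speak=false is sent as a query parameter since it is not declared as a form field in the endpoint signature:

# Illustrative client calls; BASE_URL and question.wav are placeholders.
import requests

BASE_URL = "http://localhost:7860"

with open("question.wav", "rb") as f:
    r = requests.post(
        f"{BASE_URL}/speak",
        params={"speak": "false"},                       # ask for JSON instead of audio
        data={"language": "yo"},                         # one of: yo, ha, ig, en
        files={"audio_file": ("question.wav", f, "audio/wav")},
    )
print(r.json())  # {"transcription": ..., "ai_response": ..., "language": "yo"}

with open("question.wav", "rb") as f:
    r = requests.post(
        f"{BASE_URL}/speak-auto",
        params={"speak": "false"},
        files={"audio_file": ("question.wav", f, "audio/wav")},
    )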