Commit: finish

app.py (CHANGED)

@@ -241,6 +241,50 @@ def chunk_audio(audio_array: np.ndarray, chunk_length: float = 10.0, overlap: fl
     logger.info(f"Split audio into {len(chunks)} chunks of {chunk_length}s each with quality preservation")
     return chunks
 
 def speech_to_text(audio_data: bytes) -> str:
     audio_array = preprocess_audio_ffmpeg(audio_data)
 
@@ -254,22 +298,20 @@ def speech_to_text(audio_data: bytes) -> str:
 
 def _process_single_chunk(audio_array: np.ndarray) -> str:
     candidates = []
     for code in ["yo", "ha", "ig", "en"]:
         model, proc = _get_asr(code)
         if model is None or proc is None:
             continue
         text = _run_whisper(model, proc, audio_array)
-        if text:
-            candidates.append((code, text))
-
-
-
-
-
-
-    if candidates:
-        return max((t for _, t in candidates), key=lambda s: len(s or ""))
-    return ""
 
 def _process_chunked_audio(audio_array: np.ndarray) -> str:
     chunks = chunk_audio(audio_array, chunk_length=10.0, overlap=1.0)
@@ -297,23 +339,65 @@ def _process_chunked_audio(audio_array: np.ndarray) -> str:
             language_results[code] = combined_text
             logger.info(f"Combined {code} result: {combined_text[:100]}...")
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
 
-
 
 
 def get_ai_response(text: str) -> str:
@@ -491,7 +575,29 @@ async def chat(text: str = Form(...), speak: bool = False, raw: bool = False):
     return {"question": text, "answer": final_text}
 
 @app.post("/speak")
-async def speak_to_ai(audio_file: UploadFile = File(...), speak: bool = True):
     if not audio_file.content_type.startswith('audio/'):
         raise HTTPException(status_code=400, detail="File must be an audio file")
     audio_data = await audio_file.read()

@@ -241,6 +241,50 @@ def chunk_audio(audio_array: np.ndarray, chunk_length: float = 10.0, overlap: fl
     logger.info(f"Split audio into {len(chunks)} chunks of {chunk_length}s each with quality preservation")
     return chunks
 
+def speech_to_text_with_language(audio_data: bytes, language: str) -> str:
+    audio_array = preprocess_audio_ffmpeg(audio_data)
+
+    audio_duration = len(audio_array) / 16000
+    logger.info(f"Audio duration: {audio_duration:.2f} seconds, processing with {language} model")
+
+    model, proc = _get_asr(language)
+    if model is None or proc is None:
+        logger.error(f"Failed to load {language} ASR model")
+        return ""
+
+    if audio_duration <= 15:
+        return _process_single_chunk_with_language(audio_array, model, proc, language)
+    else:
+        return _process_chunked_audio_with_language(audio_array, model, proc, language)
+
+def _process_single_chunk_with_language(audio_array: np.ndarray, model, proc, language: str) -> str:
+    text = _run_whisper(model, proc, audio_array)
+    if text and text.strip():
+        logger.info(f"Transcription ({language}): {text[:100]}...")
+        return text.strip()
+    return ""
+
+def _process_chunked_audio_with_language(audio_array: np.ndarray, model, proc, language: str) -> str:
+    chunks = chunk_audio(audio_array, chunk_length=10.0, overlap=1.0)
+
+    chunk_texts = []
+    for i, chunk in enumerate(chunks):
+        try:
+            text = _run_whisper(model, proc, chunk)
+            if text and text.strip():
+                chunk_texts.append(text.strip())
+                logger.info(f"Chunk {i+1}/{len(chunks)} ({language}): {text[:50]}...")
+        except Exception as e:
+            logger.warning(f"Failed to process chunk {i+1} with {language}: {e}")
+            continue
+
+    if chunk_texts:
+        combined_text = " ".join(chunk_texts)
+        logger.info(f"Combined {language} result: {combined_text[:100]}...")
+        return combined_text
+
+    return ""
+
 def speech_to_text(audio_data: bytes) -> str:
     audio_array = preprocess_audio_ffmpeg(audio_data)
 
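The block above adds a language-pinned transcription path: speech_to_text_with_language loads only the requested ASR model, transcribes short clips in one pass, and falls back to overlapping 10-second chunks for anything longer than 15 seconds. A minimal sketch of calling it directly; the module name "app" and the file "sample_yoruba.wav" are assumptions for the example, and the helpers (preprocess_audio_ffmpeg, _get_asr, _run_whisper) are assumed to be defined elsewhere in app.py:

# Illustrative only; "app" and "sample_yoruba.wav" are assumed names, not part of the commit.
from app import speech_to_text_with_language

with open("sample_yoruba.wav", "rb") as f:
    audio_bytes = f.read()

# Pin the Yoruba model instead of letting the app compare all four languages.
text = speech_to_text_with_language(audio_bytes, "yo")
print(text or "<no transcription>")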

@@ -254,22 +298,20 @@ def speech_to_text(audio_data: bytes) -> str:
 
 def _process_single_chunk(audio_array: np.ndarray) -> str:
     candidates = []
+
     for code in ["yo", "ha", "ig", "en"]:
         model, proc = _get_asr(code)
         if model is None or proc is None:
             continue
         text = _run_whisper(model, proc, audio_array)
+        if text and text.strip():
+            candidates.append((code, text.strip()))
+
+    if not candidates:
+        return ""
+
+    best_transcription = _select_best_transcription(candidates)
+    return best_transcription
 
 def _process_chunked_audio(audio_array: np.ndarray) -> str:
     chunks = chunk_audio(audio_array, chunk_length=10.0, overlap=1.0)

@@ -297,23 +339,65 @@ def _process_chunked_audio(audio_array: np.ndarray) -> str:
             language_results[code] = combined_text
             logger.info(f"Combined {code} result: {combined_text[:100]}...")
 
+    if not language_results:
+        return ""
+
+    best_transcription = _select_best_transcription(list(language_results.items()))
+    return best_transcription
+
+def _select_best_transcription(candidates: list) -> str:
+    if not candidates:
+        return ""
+
+    if len(candidates) == 1:
+        return candidates[0][1]
+
+    scored_candidates = []
+
+    for lang_code, text in candidates:
+        score = _score_transcription_quality(text, lang_code)
+        scored_candidates.append((score, lang_code, text))
+        logger.info(f"Transcription quality score for {lang_code}: {score:.2f} - '{text[:50]}...'")
+
+    scored_candidates.sort(key=lambda x: x[0], reverse=True)
+    best_score, best_lang, best_text = scored_candidates[0]
+
+    logger.info(f"Selected best transcription: {best_lang} (score: {best_score:.2f}) - '{best_text[:100]}...'")
+    return best_text
+
+def _score_transcription_quality(text: str, lang_code: str) -> float:
+    if not text or not text.strip():
+        return 0.0
 
+    text = text.strip()
+    score = 0.0
+
+    word_count = len(text.split())
+    if word_count == 0:
+        return 0.0
+
+    score += min(word_count * 0.5, 10.0)
+
+    char_count = len(text)
+    score += min(char_count * 0.1, 5.0)
+
+    if lang_code == "yo":
+        yoruba_chars = sum(1 for c in text if c in "ẹọṣgb")
+        score += yoruba_chars * 2.0
+    elif lang_code == "ig":
+        igbo_chars = sum(1 for c in text if c in "ụọị")
+        score += igbo_chars * 2.0
+    elif lang_code == "ha":
+        hausa_chars = sum(1 for c in text if c in "ts")
+        score += hausa_chars * 1.5
+
+    punctuation_penalty = text.count('?') + text.count('!') + text.count('.')
+    score -= punctuation_penalty * 0.5
+
+    repeated_chars = sum(1 for i in range(len(text)-2) if text[i] == text[i+1] == text[i+2])
+    score -= repeated_chars * 1.0
+
+    return max(score, 0.0)
 
 
 def get_ai_response(text: str) -> str:
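The scoring helpers above replace the earlier pick-the-longest-candidate rule: each candidate earns points for word and character count plus a bonus for language-specific characters, and loses points for heavy punctuation and character repetition. A small illustrative run, with invented candidate strings (not taken from the app):

# Illustrative only; the candidate texts are made up for the example.
candidates = [
    ("en", "bawo ni"),                    # short, no language-specific bonus
    ("yo", "báwo ni ọjà ṣe wà lónìí"),    # longer, and ọ/ṣ earn the Yoruba bonus
]

for code, text in candidates:
    print(code, round(_score_transcription_quality(text, code), 2))

print(_select_best_transcription(candidates))  # prints the Yoruba candidate here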

@@ -491,7 +575,29 @@ async def chat(text: str = Form(...), speak: bool = False, raw: bool = False):
     return {"question": text, "answer": final_text}
 
 @app.post("/speak")
+async def speak_to_ai(audio_file: UploadFile = File(...), language: str = Form(...), speak: bool = True):
+    if not audio_file.content_type.startswith('audio/'):
+        raise HTTPException(status_code=400, detail="File must be an audio file")
+
+    if language not in ["yo", "ha", "ig", "en"]:
+        raise HTTPException(status_code=400, detail="Language must be one of: yo (Yoruba), ha (Hausa), ig (Igbo), en (English)")
+
+    audio_data = await audio_file.read()
+    transcription = speech_to_text_with_language(audio_data, language)
+    ai_response = get_ai_response(transcription)
+
+    if speak:
+        output_path = text_to_speech_file(ai_response)
+        lang = detect_language(ai_response)
+        if lang == "ig":
+            return FileResponse(output_path, media_type="text/plain", filename="response.txt")
+        else:
+            return FileResponse(output_path, media_type="audio/wav", filename="response.wav")
+
+    return {"transcription": transcription, "ai_response": ai_response, "language": language}
+
+@app.post("/speak-auto")
+async def speak_to_ai_auto(audio_file: UploadFile = File(...), speak: bool = True):
     if not audio_file.content_type.startswith('audio/'):
         raise HTTPException(status_code=400, detail="File must be an audio file")
     audio_data = await audio_file.read()
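With these changes, /speak requires an explicit language form field and delegates to speech_to_text_with_language, while /speak-auto appears to keep the old auto-detection behaviour of trying all four models. A rough client-side sketch using requests; the base URL and file name are placeholders, and speak=false is sent as a query parameter since it is not declared as a form field in the endpoint signature:

# Illustrative client calls; BASE_URL and question.wav are placeholders.
import requests

BASE_URL = "http://localhost:7860"

with open("question.wav", "rb") as f:
    r = requests.post(
        f"{BASE_URL}/speak",
        params={"speak": "false"},                       # ask for JSON instead of audio
        data={"language": "yo"},                         # one of: yo, ha, ig, en
        files={"audio_file": ("question.wav", f, "audio/wav")},
    )
print(r.json())  # {"transcription": ..., "ai_response": ..., "language": "yo"}

with open("question.wav", "rb") as f:
    r = requests.post(
        f"{BASE_URL}/speak-auto",
        params={"speak": "false"},
        files={"audio_file": ("question.wav", f, "audio/wav")},
    )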