"""FastAPI service that transcribes Arabic audio with Faster-Whisper.

Importing this module sets the Hugging Face cache location, loads the
Whisper model once, and exposes three routes:

* ``POST /transcribe`` - transcribe an uploaded audio file and return the
  text plus per-word timestamps.
* ``GET /health`` - report model/device information.
* ``GET /`` - simple liveness message.
"""
import os
import tempfile
import traceback
from typing import Optional

import torch
from fastapi import FastAPI, UploadFile, File
from fastapi.concurrency import run_in_threadpool
from fastapi.responses import JSONResponse
from fastapi.middleware.cors import CORSMiddleware

# -----------------------------------
# Environment setup
# -----------------------------------
# Directory used for model caches, the model download, and uploaded audio.
CACHE_DIR = "/tmp"

# These are set before faster_whisper is imported below; keep that order.
os.environ["HF_HOME"] = CACHE_DIR
os.environ["TRANSFORMERS_CACHE"] = CACHE_DIR

device = "cuda" if torch.cuda.is_available() else "cpu"
compute_type = "float16" if device == "cuda" else "int8"

# Model size - options: tiny, base, small, medium, large-v2, large-v3.
# Change to "small" or "medium" for better Quran accuracy.
MODEL_SIZE = "base"
# Name reported by the API; derived from MODEL_SIZE so the two cannot drift.
MODEL_NAME = f"faster-whisper-{MODEL_SIZE}"

print(f"🚀 Loading Faster-Whisper model on {device}...")

# Import and load faster-whisper (deliberately after the environment setup).
from faster_whisper import WhisperModel

model = WhisperModel(
    MODEL_SIZE,
    device=device,
    compute_type=compute_type,
    download_root=CACHE_DIR,
)

print("✅ Faster-Whisper model loaded with word-level timestamp support")

# -----------------------------------
# FastAPI app setup
# -----------------------------------
app = FastAPI(title="Qur'an Whisper Arabic Transcriber with Word-Level Timestamps")

# NOTE(review): wildcard origins together with allow_credentials=True is very
# permissive - consider restricting allow_origins for production use.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


def _safe_suffix(filename: Optional[str]) -> str:
    """Return a harmless file extension taken from *filename*, or ``""``.

    Only the extension of the client-supplied name is reused; anything that
    is missing, overly long, or not purely alphanumeric is dropped so the
    upload name can never influence where the temp file is written.
    """
    ext = os.path.splitext(os.path.basename(filename or ""))[1]
    if len(ext) <= 16 and ext[1:].isalnum():
        return ext
    return ""


def _transcribe_file(path: str) -> dict:
    """Run the model on the audio file at *path* and build the response dict.

    This is synchronous, blocking work; the route handler runs it in a
    worker thread so the event loop stays responsive.
    """
    segments, info = model.transcribe(
        path,
        language="ar",  # Arabic
        task="transcribe",
        word_timestamps=True,  # Enable word-level timestamps
        vad_filter=True,  # Voice activity detection
    )

    full_text = []
    words = []
    # ``segments`` is a generator; iterating it drives the transcription.
    for segment in segments:
        full_text.append(segment.text)
        # Guard against a segment that carries no word data.
        for word in segment.words or ():
            words.append({
                "word": word.word.strip(),
                "start": round(word.start, 2),
                "end": round(word.end, 2),
                "probability": round(word.probability, 3),
            })

    print(f"✅ Transcription complete: {len(words)} words, duration: {info.duration}s")

    return {
        "text": " ".join(full_text).strip(),
        "words": words,
        "language": info.language,
        "language_probability": round(info.language_probability, 3),
        "duration": round(info.duration, 2),
        "model": MODEL_NAME,
    }


@app.post("/transcribe")
async def transcribe(audio: UploadFile = File(...)):
    """
    Transcribe Arabic audio with word-level timestamps.

    Transcribes the entire audio at once and returns timestamp for each word.

    The upload is written to a uniquely named temporary file (the client's
    filename is never used as a path), transcribed in a worker thread, and
    the temporary file is removed afterwards.

    Returns a JSON object with ``text``, ``words``, ``language``,
    ``language_probability``, ``duration`` and ``model`` on success, or
    ``{"error": ...}`` with status 500 if anything fails.
    """
    temp_path = None
    try:
        # A generated name avoids path traversal via audio.filename and
        # collisions between concurrent uploads that share a filename.
        with tempfile.NamedTemporaryFile(
            delete=False,
            dir=CACHE_DIR,
            suffix=_safe_suffix(audio.filename),
        ) as tmp:
            temp_path = tmp.name
            tmp.write(await audio.read())

        print(f"🎧 Transcribing: {temp_path}")

        # Transcription is blocking; keep it off the event loop.
        output = await run_in_threadpool(_transcribe_file, temp_path)
        return JSONResponse(output)

    except Exception as e:
        # Top-level boundary: log the failure and report it to the client.
        print("❌ Error:", e)
        traceback.print_exc()
        return JSONResponse({"error": str(e)}, status_code=500)

    finally:
        # Clean up temp file (best effort, but do not hide real failures).
        if temp_path is not None:
            try:
                os.remove(temp_path)
            except FileNotFoundError:
                pass
            except OSError as cleanup_err:
                print("⚠️ Could not remove temp file:", cleanup_err)


@app.get("/health")
def health():
    """Return static service information: model name, device and capabilities."""
    return {
        "status": "healthy",
        "model": MODEL_NAME,
        "device": device,
        "timestamp_support": "word-level",
        "language": "arabic",
    }


@app.get("/")
def index():
    """Return a short message confirming the API is running."""
    return {"message": "🕌 Qur'an Whisper Transcription API with Word-Level Timestamps is running!"}


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)