| """ |
| OmniVoice FastAPI server for Linux + NVIDIA (e.g. RunPod). |
| |
| Exposes the full OmniVoice surface (voice clone, voice design, auto voice) + all |
| generation parameters from `omnivoice.OmniVoiceGenerationConfig`. |
| |
| Endpoints |
| --------- |
| - GET /health Liveness/readiness + startup error. |
| - GET /v1/models List served model. |
| - GET /v1/languages All language display names supported by the model. |
| - GET /v1/voice-design/attributes Attribute groups for the Voice Design composer. |
| - POST /v1/audio/speech Unified TTS endpoint (multipart): |
| text=<str> (required) |
| mode=clone|design|auto (default: clone if ref_audio else design if instruct else auto) |
| ref_audio=<file> (clone) |
| ref_text=<str> (clone, optional — Whisper auto-transcribes if omitted) |
| instruct=<str> (design or clone overlay) |
| language=<str> (display name, e.g. "English", "Hindi"; "Auto" for auto-detect) |
| speed=<float> (default 1.0) |
| duration=<float> (seconds; if set overrides speed) |
| num_step=<int> (default 32) |
| guidance_scale=<float> (default 2.0) |
| denoise=<bool> (default true) |
| preprocess_prompt=<bool> (default true) |
| postprocess_output=<bool> (default true) |
| - POST /v1/audio/speech/multi Multi-character story generation (JSON body). |
| Splits text on [Sn]…[/Sn] markers, runs OmniVoice once per chunk with |
| each character's mode/instruct/ref_audio (clone or design), and stitches |
| the resulting WAVs (with a configurable inter-segment silence) into a |
| single WAV response. See `MultiSpeechRequest` schema below. |
| - POST /v1/audio/speech/clone Backward-compat shim (forwards to /v1/audio/speech with mode=clone). |
| |
| Returns: 200 audio/wav (16-bit PCM mono @ 24 kHz). |
| """ |
|
|
import asyncio
import base64
import binascii
import io
import logging
import os
import platform
import re
import tempfile
import wave
from typing import Any

import numpy as np
import torch
from fastapi import FastAPI, File, Form, HTTPException, UploadFile
from fastapi.responses import JSONResponse, StreamingResponse
from pydantic import BaseModel, Field

from omnivoice import OmniVoice, OmniVoiceGenerationConfig
from omnivoice.utils.lang_map import LANG_NAMES, lang_display_name
|
|
logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")

MODEL_ID = os.getenv("OMNIVOICE_MODEL_ID", "pravinuxd/OmniVoice")
SAMPLE_RATE = 24000


def _resolve_device_and_dtype() -> tuple[str, torch.dtype]:
    if torch.cuda.is_available():
        return "cuda:0", torch.float16
    return "cpu", torch.float32


DEVICE, DTYPE = _resolve_device_and_dtype()


def _enforce_cuda_if_runpod() -> None:
    on_runpod = bool(os.getenv("RUNPOD_POD_ID"))
    require = os.getenv("OMNIVOICE_REQUIRE_CUDA", "").strip().lower()
    if require in ("1", "true", "yes"):
        must_cuda = True
    elif require in ("0", "false", "no"):
        must_cuda = False
    else:
        must_cuda = on_runpod
    if must_cuda and not torch.cuda.is_available():
        raise RuntimeError(
            "CUDA is not available. This image is built for NVIDIA GPUs on Linux (RunPod). "
            "Attach a GPU to the pod or set OMNIVOICE_REQUIRE_CUDA=0 only for debugging."
        )


_enforce_cuda_if_runpod()

if torch.cuda.is_available():
    logging.info(
        "Using GPU: %s | CUDA %s",
        torch.cuda.get_device_name(0),
        torch.version.cuda,
    )
else:
    logging.warning("Running on CPU — not recommended for production OmniVoice inference.")
logging.info(
    "Platform: %s | MODEL_ID=%s | device=%s | dtype=%s",
    platform.system(),
    MODEL_ID,
    DEVICE,
    DTYPE,
)
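
# Example launch (sketch): the `server:app` module path is hypothetical;
# substitute whatever this file is named in your image.
#
#   OMNIVOICE_MODEL_ID=pravinuxd/OmniVoice OMNIVOICE_REQUIRE_CUDA=1 \
#       uvicorn server:app --host 0.0.0.0 --port 8000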
|
|
_LANGUAGES_SORTED = sorted({lang_display_name(n) for n in LANG_NAMES})


_VOICE_DESIGN_ATTRIBUTES = {
    "gender": {
        "label": "Gender",
        "info": "Speaker gender.",
        "options": [
            {"value": "male", "label": "Male / 男"},
            {"value": "female", "label": "Female / 女"},
        ],
    },
    "age": {
        "label": "Age",
        "info": "Approximate speaker age.",
        "options": [
            {"value": "child", "label": "Child / 儿童"},
            {"value": "teenager", "label": "Teenager / 少年"},
            {"value": "young adult", "label": "Young Adult / 青年"},
            {"value": "middle-aged", "label": "Middle-aged / 中年"},
            {"value": "elderly", "label": "Elderly / 老年"},
        ],
    },
    "pitch": {
        "label": "Pitch",
        "info": "Voice pitch register.",
        "options": [
            {"value": "very low pitch", "label": "Very Low / 极低音调"},
            {"value": "low pitch", "label": "Low / 低音调"},
            {"value": "moderate pitch", "label": "Moderate / 中音调"},
            {"value": "high pitch", "label": "High / 高音调"},
            {"value": "very high pitch", "label": "Very High / 极高音调"},
        ],
    },
    "style": {
        "label": "Style",
        "info": "Speaking style.",
        "options": [
            {"value": "whisper", "label": "Whisper / 耳语"},
        ],
    },
    "english_accent": {
        "label": "English Accent",
        "info": "Only effective when generating English speech.",
        "options": [
            {"value": "american accent", "label": "American"},
            {"value": "australian accent", "label": "Australian"},
            {"value": "british accent", "label": "British"},
            {"value": "canadian accent", "label": "Canadian"},
            {"value": "chinese accent", "label": "Chinese"},
            {"value": "indian accent", "label": "Indian"},
            {"value": "japanese accent", "label": "Japanese"},
            {"value": "korean accent", "label": "Korean"},
            {"value": "portuguese accent", "label": "Portuguese"},
            {"value": "russian accent", "label": "Russian"},
        ],
    },
    "chinese_dialect": {
        "label": "Chinese Dialect",
        "info": "Only effective when generating Chinese speech.",
        "options": [
            {"value": "河南话", "label": "Henan / 河南话"},
            {"value": "陕西话", "label": "Shaanxi / 陕西话"},
            {"value": "四川话", "label": "Sichuan / 四川话"},
            {"value": "贵州话", "label": "Guizhou / 贵州话"},
            {"value": "云南话", "label": "Yunnan / 云南话"},
            {"value": "桂林话", "label": "Guilin / 桂林话"},
            {"value": "济南话", "label": "Jinan / 济南话"},
            {"value": "石家庄话", "label": "Shijiazhuang / 石家庄话"},
            {"value": "甘肃话", "label": "Gansu / 甘肃话"},
            {"value": "宁夏话", "label": "Ningxia / 宁夏话"},
            {"value": "青岛话", "label": "Qingdao / 青岛话"},
            {"value": "东北话", "label": "Northeast / 东北话"},
        ],
    },
}
|
|
app = FastAPI(title="OmniVoice Pod API", version="2")
model: OmniVoice | None = None
startup_error: str | None = None


# The dubbing router lives in a sibling module; support both package-style
# and flat (script) imports.
try:
    from . import dub as _dub
except ImportError:
    import dub as _dub

app.include_router(_dub.router)
|
|
|
|
def _load_model() -> OmniVoice:
    return OmniVoice.from_pretrained(
        MODEL_ID,
        device_map=DEVICE,
        dtype=DTYPE,
        load_asr=True,
    )


@app.on_event("startup")
def startup_event() -> None:
    global model, startup_error
    try:
        model = _load_model()
        startup_error = None
        # Hand the shared model (and generation-config class) to the dub router.
        _dub.configure(model, OmniVoiceGenerationConfig)
    except Exception as exc:
        startup_error = f"{type(exc).__name__}: {exc}"
        raise
|
|
|
|
def _wav_bytes(audio: np.ndarray) -> bytes:
    clipped = np.clip(audio, -1.0, 1.0)
    pcm16 = (clipped * 32767.0).astype(np.int16)
    buf = io.BytesIO()
    with wave.open(buf, "wb") as wav:
        wav.setnchannels(1)
        wav.setsampwidth(2)
        wav.setframerate(SAMPLE_RATE)
        wav.writeframes(pcm16.tobytes())
    return buf.getvalue()


def _parse_bool(val: str | None, default: bool) -> bool:
    if val is None:
        return default
    return val.strip().lower() in ("1", "true", "yes", "on")


def _normalize_language(lang: str | None) -> str | None:
    if not lang:
        return None
    cleaned = lang.strip()
    if not cleaned or cleaned.lower() == "auto":
        return None
    return cleaned


def _resolve_mode(mode: str | None, has_ref_audio: bool, has_instruct: bool) -> str:
    if mode:
        m = mode.strip().lower()
        if m in ("clone", "design", "auto"):
            return m
    if has_ref_audio:
        return "clone"
    if has_instruct:
        return "design"
    return "auto"
|
|
|
|
| @app.get("/health") |
| def health() -> JSONResponse: |
| ready = model is not None and startup_error is None |
| return JSONResponse( |
| { |
| "status": "healthy" if ready else "starting", |
| "ready": ready, |
| "model_loaded": ready, |
| "model_id": MODEL_ID, |
| "device": DEVICE, |
| "startup_error": startup_error, |
| } |
| ) |
|
|
|
|
| @app.get("/v1/models") |
| def list_models() -> JSONResponse: |
| return JSONResponse( |
| { |
| "object": "list", |
| "data": [ |
| { |
| "id": "omnivoice", |
| "object": "model", |
| "owned_by": "pravinuxd", |
| "root": MODEL_ID, |
| } |
| ], |
| } |
| ) |
|
|
|
|
| @app.get("/v1/languages") |
| def list_languages() -> JSONResponse: |
| return JSONResponse({"languages": _LANGUAGES_SORTED, "count": len(_LANGUAGES_SORTED)}) |
|
|
|
|
| @app.get("/v1/voice-design/attributes") |
| def list_voice_design_attributes() -> JSONResponse: |
| return JSONResponse({"attributes": _VOICE_DESIGN_ATTRIBUTES}) |
|
|
|
|
def _generate_audio(
    *,
    text: str,
    mode: str,
    ref_audio_path: str | None,
    ref_text: str | None,
    instruct: str | None,
    language: str | None,
    speed: float,
    duration: float | None,
    num_step: int,
    guidance_scale: float,
    denoise: bool,
    preprocess_prompt: bool,
    postprocess_output: bool,
) -> bytes:
    if model is None:
        raise HTTPException(status_code=503, detail=startup_error or "Model not ready")

    gen_config = OmniVoiceGenerationConfig(
        num_step=num_step,
        guidance_scale=guidance_scale,
        denoise=denoise,
        preprocess_prompt=preprocess_prompt,
        postprocess_output=postprocess_output,
    )

    kw: dict[str, Any] = {
        "text": text,
        "language": language,
        "generation_config": gen_config,
    }
    if speed != 1.0:
        kw["speed"] = speed
    if duration is not None and duration > 0:
        kw["duration"] = duration

    if mode == "clone":
        if not ref_audio_path:
            raise HTTPException(status_code=400, detail="mode=clone requires ref_audio")
        kw["voice_clone_prompt"] = model.create_voice_clone_prompt(
            ref_audio=ref_audio_path,
            ref_text=ref_text or None,
        )

    if instruct and instruct.strip():
        kw["instruct"] = instruct.strip()

    try:
        generated = model.generate(**kw)
    except HTTPException:
        raise
    except Exception as exc:
        logging.exception("OmniVoice generation failed")
        raise HTTPException(
            status_code=500, detail=f"{type(exc).__name__}: {exc}"
        ) from exc

    # The model may hand back a torch tensor; normalize to a flat float32
    # numpy array before WAV encoding (mirrors the handling in _generate_chunk).
    audio = generated[0]
    if isinstance(audio, torch.Tensor):
        audio = audio.detach().cpu().float().numpy()
    return _wav_bytes(np.asarray(audio, dtype=np.float32).reshape(-1))
|
|
|
|
| @app.post("/v1/audio/speech") |
| async def synth_speech( |
| text: str = Form(...), |
| mode: str | None = Form(None), |
| ref_audio: UploadFile | None = File(None), |
| ref_text: str | None = Form(None), |
| instruct: str | None = Form(None), |
| language: str | None = Form(None), |
| speed: float = Form(1.0), |
| duration: float | None = Form(None), |
| num_step: int = Form(32), |
| guidance_scale: float = Form(2.0), |
| denoise: str | None = Form(None), |
| preprocess_prompt: str | None = Form(None), |
| postprocess_output: str | None = Form(None), |
| ) -> StreamingResponse: |
| text = (text or "").strip() |
| if not text: |
| raise HTTPException(status_code=400, detail="text is required") |
|
|
| has_ref = ref_audio is not None and (ref_audio.filename or "") |
| has_instruct = bool(instruct and instruct.strip()) |
| resolved_mode = _resolve_mode(mode, bool(has_ref), has_instruct) |
|
|
| tmp_path: str | None = None |
| try: |
| if resolved_mode == "clone": |
| if not has_ref: |
| raise HTTPException( |
| status_code=400, detail="mode=clone requires ref_audio" |
| ) |
| audio_bytes = await ref_audio.read() |
| suffix = ( |
| os.path.splitext(ref_audio.filename or "reference.wav")[1] or ".wav" |
| ) |
| with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp: |
| tmp.write(audio_bytes) |
| tmp_path = tmp.name |
|
|
| |
| |
| async with _dub.GPU_LOCK: |
| wav_bytes = await asyncio.to_thread( |
| _generate_audio, |
| text=text, |
| mode=resolved_mode, |
| ref_audio_path=tmp_path, |
| ref_text=(ref_text or None), |
| instruct=(instruct or None), |
| language=_normalize_language(language), |
| speed=float(speed), |
| duration=(float(duration) if duration is not None else None), |
| num_step=int(num_step or 32), |
| guidance_scale=float(guidance_scale), |
| denoise=_parse_bool(denoise, True), |
| preprocess_prompt=_parse_bool(preprocess_prompt, True), |
| postprocess_output=_parse_bool(postprocess_output, True), |
| ) |
|
|
| headers = {"X-OmniVoice-Mode": resolved_mode} |
| return StreamingResponse( |
| io.BytesIO(wav_bytes), media_type="audio/wav", headers=headers |
| ) |
| finally: |
| if tmp_path and os.path.exists(tmp_path): |
| os.unlink(tmp_path) |
|
|
|
|
# --- Multi-character story generation --------------------------------------

# Matches one speaker block: [Sn <attrs>] … [/Sn] with n in 1..8; group 2
# captures the raw attribute text between the slot number and "]".
_SPEAKER_TAG_RE = re.compile(
    r"\[\s*S([1-8])([^\]]*)\](.*?)\[\s*/\s*S\1\s*\]",
    re.DOTALL,
)

# Matches a chunk that is nothing but a standalone pause directive; the
# unit suffix is optional and defaults to seconds.
_PAUSE_TAG_RE = re.compile(
    r"^\s*\[\s*pause\s*=\s*([0-9]*\.?[0-9]+)\s*(s|ms)?\s*\]\s*$",
    re.IGNORECASE,
)

# Matches a duration=Xs / duration=Xms attribute inside an [Sn …] tag.
_DURATION_ATTR_RE = re.compile(
    r"duration\s*=\s*([0-9]*\.?[0-9]+)\s*(s|ms)?",
    re.IGNORECASE,
)
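
# Marker grammar examples (sketch of inputs these patterns accept):
#   "[S1] Hello! [/S1]"                        -> slot 1, no attributes
#   "[S2 duration=2.5s] Take your time. [/S2]" -> slot 2, duration attr 2.5 s
#   "[pause=500ms]"                            -> standalone 0.5 s silence chunk
# Text outside any [Sn]…[/Sn] block is treated as narration.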
|
|
|
|
def _parse_pause_seconds(text: str) -> float | None:
    """If `text` is a single `[pause=…]` directive, return its length in seconds."""
    m = _PAUSE_TAG_RE.match(text)
    if not m:
        return None
    value = float(m.group(1))
    unit = (m.group(2) or "s").lower()
    return value / 1000.0 if unit == "ms" else value


def _parse_duration_attr(attrs: str) -> float | None:
    """Parse `duration=Xs` / `duration=Xms` from the `[Sn …]` attribute string."""
    m = _DURATION_ATTR_RE.search(attrs or "")
    if not m:
        return None
    value = float(m.group(1))
    unit = (m.group(2) or "s").lower()
    return value / 1000.0 if unit == "ms" else value
|
|
|
|
class CharacterConfig(BaseModel):
    """One character slot (S1..S8) used by /v1/audio/speech/multi."""

    slot: int = Field(..., ge=1, le=8)
    name: str | None = None
    mode: str = Field(..., pattern="^(design|clone)$")
    instruct: str | None = None
    language: str | None = None
    speed: float | None = None

    # Clone-mode only: base64-encoded reference audio plus optional transcript.
    ref_audio_b64: str | None = None
    ref_text: str | None = None


class MultiSpeechRequest(BaseModel):
    text: str
    characters: list[CharacterConfig] = Field(default_factory=list)

    # Optional voice for narration (text outside any [Sn]…[/Sn] block).
    narrator: CharacterConfig | None = None

    # Generation parameters shared by every chunk.
    speed: float = 1.0
    num_step: int = 32
    guidance_scale: float = 2.0
    denoise: bool = True
    preprocess_prompt: bool = True
    postprocess_output: bool = True

    # Silence stitched between consecutive segments.
    inter_segment_silence_ms: int = 250
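
# Example request body (sketch; the base64 payload is elided):
# {
#   "text": "[S1] Hi there! [/S1] The line went quiet. [pause=300ms] [S2] ...hello? [/S2]",
#   "characters": [
#     {"slot": 1, "mode": "design", "instruct": "young adult female, british accent"},
#     {"slot": 2, "mode": "clone", "ref_audio_b64": "<base64 wav>", "ref_text": "optional transcript"}
#   ],
#   "narrator": {"slot": 8, "mode": "design", "instruct": "calm, low pitch"},
#   "inter_segment_silence_ms": 250
# }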
|
|
|
|
def _split_into_segments(
    text: str,
) -> list[tuple[int | None, str, dict[str, float]]]:
    """Split `text` into (slot|None, chunk_text, attrs) segments.

    Slot is None for narration (text outside any [Sn]…[/Sn] block).
    Empty / whitespace-only chunks are dropped. `attrs` carries optional
    metadata parsed from the speaker tag (currently `duration`).
    """
    segments: list[tuple[int | None, str, dict[str, float]]] = []
    cursor = 0
    for match in _SPEAKER_TAG_RE.finditer(text):
        before = text[cursor : match.start()]
        if before.strip():
            segments.append((None, before.strip(), {}))
        slot = int(match.group(1))
        attrs_str = match.group(2) or ""
        inner = match.group(3).strip()
        attrs: dict[str, float] = {}
        dur = _parse_duration_attr(attrs_str)
        if dur is not None and dur > 0:
            attrs["duration"] = dur
        if inner:
            segments.append((slot, inner, attrs))
        cursor = match.end()
    tail = text[cursor:]
    if tail.strip():
        segments.append((None, tail.strip(), {}))
    return segments
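
# Worked example (sketch):
#   _split_into_segments("Intro. [S1 duration=2s] Hi! [/S1] Outro.")
#   -> [(None, "Intro.", {}), (1, "Hi!", {"duration": 2.0}), (None, "Outro.", {})]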
|
|
|
|
def _decode_ref_audio(b64: str) -> bytes:
    try:
        return base64.b64decode(b64, validate=True)
    except (binascii.Error, ValueError) as exc:
        raise HTTPException(
            status_code=400, detail=f"Invalid ref_audio_b64: {exc}"
        ) from exc


def _silence_pcm(milliseconds: int) -> np.ndarray:
    samples = max(0, int(SAMPLE_RATE * (milliseconds / 1000.0)))
    return np.zeros(samples, dtype=np.float32)
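
# Worked example: the default 250 ms gap at 24 kHz is
# int(24000 * 0.25) = 6000 zero-valued samples.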
|
|
|
|
def _wav_bytes_to_float_pcm(buf: bytes) -> np.ndarray:
    """Read a 16-bit PCM mono WAV (any sample rate) and return float32 [-1, 1].

    If the sample rate doesn't match SAMPLE_RATE we keep it as-is and rely on
    the model emitting at SAMPLE_RATE; this is a defensive helper used only
    when we round-trip WAVs (we never receive WAVs from the model — we get
    raw float arrays — so this branch is mostly here for testing).
    """
    with wave.open(io.BytesIO(buf), "rb") as wav:
        n = wav.getnframes()
        raw = wav.readframes(n)
    pcm16 = np.frombuffer(raw, dtype=np.int16)
    return (pcm16.astype(np.float32) / 32767.0).copy()
|
|
|
|
def _generate_chunk(
    *,
    text: str,
    character: CharacterConfig | None,
    common_speed: float,
    num_step: int,
    guidance_scale: float,
    denoise: bool,
    preprocess_prompt: bool,
    postprocess_output: bool,
    block_duration: float | None = None,
) -> np.ndarray:
    """Generate a single chunk with the given character and return a float32 PCM array."""
    if model is None:
        raise HTTPException(status_code=503, detail=startup_error or "Model not ready")

    mode = "auto"
    instruct = None
    language = None
    speed = common_speed
    ref_audio_path: str | None = None
    ref_text: str | None = None
    cleanup_paths: list[str] = []

    try:
        if character is not None:
            mode = character.mode
            instruct = (character.instruct or "").strip() or None
            language = _normalize_language(character.language)
            if character.speed is not None:
                speed = character.speed
            if mode == "clone":
                if not character.ref_audio_b64:
                    raise HTTPException(
                        status_code=400,
                        detail=(
                            f"Character S{character.slot} (mode=clone) requires ref_audio_b64"
                        ),
                    )
                audio_bytes = _decode_ref_audio(character.ref_audio_b64)
                with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
                    tmp.write(audio_bytes)
                    ref_audio_path = tmp.name
                cleanup_paths.append(ref_audio_path)
                ref_text = (character.ref_text or "").strip() or None

        gen_config = OmniVoiceGenerationConfig(
            num_step=num_step,
            guidance_scale=guidance_scale,
            denoise=denoise,
            preprocess_prompt=preprocess_prompt,
            postprocess_output=postprocess_output,
        )
        kw: dict[str, Any] = {
            "text": text,
            "language": language,
            "generation_config": gen_config,
        }

        # A per-block duration (from an [Sn duration=…] attribute) takes
        # precedence over the speed setting.
        if block_duration is not None and block_duration > 0:
            kw["duration"] = float(block_duration)
        elif speed != 1.0:
            kw["speed"] = speed
        if mode == "clone" and ref_audio_path:
            kw["voice_clone_prompt"] = model.create_voice_clone_prompt(
                ref_audio=ref_audio_path,
                ref_text=ref_text,
            )
        if instruct:
            kw["instruct"] = instruct

        try:
            generated = model.generate(**kw)
        except HTTPException:
            raise
        except Exception as exc:
            logging.exception("OmniVoice multi-character chunk failed")
            raise HTTPException(
                status_code=500,
                detail=f"{type(exc).__name__}: {exc}",
            ) from exc

        # Normalize model output (possibly a torch tensor) to a flat float32 array.
        chunk = generated[0]
        if isinstance(chunk, torch.Tensor):
            chunk = chunk.detach().cpu().float().numpy()
        chunk = np.asarray(chunk, dtype=np.float32).reshape(-1)
        return chunk
    finally:
        for path in cleanup_paths:
            if path and os.path.exists(path):
                try:
                    os.unlink(path)
                except OSError:
                    pass


def _concat_pcm(arrays: list[np.ndarray], silence_samples: int) -> np.ndarray:
    if not arrays:
        return np.zeros(0, dtype=np.float32)
    if len(arrays) == 1:
        return arrays[0]
    silence = np.zeros(silence_samples, dtype=np.float32)
    out: list[np.ndarray] = []
    for i, arr in enumerate(arrays):
        if i > 0:
            out.append(silence)
        out.append(arr)
    return np.concatenate(out, dtype=np.float32)
|
|
|
|
| @app.post("/v1/audio/speech/multi") |
| async def synth_multi_speech(req: MultiSpeechRequest) -> StreamingResponse: |
| text = (req.text or "").strip() |
| if not text: |
| raise HTTPException(status_code=400, detail="text is required") |
|
|
| characters_by_slot: dict[int, CharacterConfig] = {c.slot: c for c in req.characters} |
|
|
| segments = _split_into_segments(text) |
| if not segments: |
| raise HTTPException( |
| status_code=400, detail="text has no generatable content after parsing" |
| ) |
|
|
| referenced_slots = {slot for slot, _, _ in segments if slot is not None} |
| missing = sorted(referenced_slots - characters_by_slot.keys()) |
| if missing: |
| raise HTTPException( |
| status_code=400, |
| detail=f"Missing character config for slot(s): {missing}", |
| ) |
|
|
| chunks: list[np.ndarray] = [] |
| |
| |
| |
| async with _dub.GPU_LOCK: |
| for slot, chunk_text, attrs in segments: |
| pause_seconds = _parse_pause_seconds(chunk_text) |
| if pause_seconds is not None and pause_seconds > 0: |
| chunks.append(_silence_pcm(int(pause_seconds * 1000))) |
| continue |
|
|
| character = ( |
| characters_by_slot.get(slot) if slot is not None else req.narrator |
| ) |
| |
| |
| |
| block_duration = attrs.get("duration") if attrs else None |
| pcm = await asyncio.to_thread( |
| _generate_chunk, |
| text=chunk_text, |
| character=character, |
| common_speed=req.speed, |
| num_step=req.num_step, |
| guidance_scale=req.guidance_scale, |
| denoise=req.denoise, |
| preprocess_prompt=req.preprocess_prompt, |
| postprocess_output=req.postprocess_output, |
| block_duration=block_duration, |
| ) |
| chunks.append(pcm) |
|
|
| silence_samples = max(0, int(SAMPLE_RATE * (req.inter_segment_silence_ms / 1000.0))) |
| combined = _concat_pcm(chunks, silence_samples=silence_samples) |
| wav_bytes = _wav_bytes(combined) |
| headers = { |
| "X-OmniVoice-Mode": "multi", |
| "X-OmniVoice-Segments": str(len(segments)), |
| } |
| return StreamingResponse( |
| io.BytesIO(wav_bytes), media_type="audio/wav", headers=headers |
| ) |
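
# Example client call for the multi endpoint (sketch; hypothetical host/port):
#
#   import requests
#
#   resp = requests.post(
#       "http://localhost:8000/v1/audio/speech/multi",
#       json={
#           "text": "[S1] Once upon a time... [/S1]",
#           "characters": [{"slot": 1, "mode": "design", "instruct": "elderly male, low pitch"}],
#       },
#   )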
|
|
|
|
| @app.post("/v1/audio/speech/clone") |
| async def clone_speech_compat( |
| text: str = Form(...), |
| ref_audio: UploadFile = File(...), |
| ref_text: str | None = Form(None), |
| instruct: str | None = Form(None), |
| language: str | None = Form(None), |
| speed: float = Form(1.0), |
| num_step: int = Form(32), |
| guidance_scale: float = Form(2.0), |
| ) -> StreamingResponse: |
| """Backward-compat shim — same as POST /v1/audio/speech with mode=clone.""" |
| return await synth_speech( |
| text=text, |
| mode="clone", |
| ref_audio=ref_audio, |
| ref_text=ref_text, |
| instruct=instruct, |
| language=language, |
| speed=speed, |
| duration=None, |
| num_step=num_step, |
| guidance_scale=guidance_scale, |
| denoise=None, |
| preprocess_prompt=None, |
| postprocess_output=None, |
| ) |
|
|