Spaces:
Sleeping
Sleeping
Upload 6 files
Browse files- .gitattributes +1 -0
- Dockerfile +18 -0
- app/main.py +38 -0
- app/rvc_infer.py +48 -0
- models/Chiyu_v2_48k.pth +3 -0
- models/added_IVF256_Flat_nprobe_1_Chiyu_v2_48k_v2.index +3 -0
- requirements.txt +9 -0
.gitattributes
CHANGED
|
@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
models/Chiyu_v2_48k/voice.index filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
models/Chiyu_v2_48k/voice.index filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
models/added_IVF256_Flat_nprobe_1_Chiyu_v2_48k_v2.index filter=lfs diff=lfs merge=lfs -text
|
Dockerfile
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Container image for the FastAPI text-to-speech + RVC voice-conversion service.
FROM python:3.10-slim

WORKDIR /app

# System deps: ffmpeg and git.
# --no-install-recommends keeps optional "recommended" packages out of the
# image; the apt lists are removed in the same layer so they never get stored.
RUN apt-get update \
    && apt-get install -y --no-install-recommends ffmpeg git \
    && rm -rf /var/lib/apt/lists/*

# Python deps (copied on their own first so this layer is cached until
# requirements.txt changes)
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy app + models
COPY app/ ./app/
COPY models/ ./models/

# Port uvicorn listens on (see CMD below)
EXPOSE 7860

CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
|
app/main.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import asyncio
|
| 3 |
+
import tempfile
|
| 4 |
+
import subprocess
|
| 5 |
+
from fastapi import FastAPI, Form
|
| 6 |
+
from fastapi.responses import FileResponse
|
| 7 |
+
import edge_tts
|
| 8 |
+
|
| 9 |
+
app = FastAPI()
|
| 10 |
+
|
| 11 |
+
MODEL_PATH = "models/Chiyu_v2_48k.pth"
|
| 12 |
+
INDEX_PATH = "models/added_IVF256_Flat_nprobe_1_Chiyu_v2_48k_v2.index"
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def _remove_quietly(path):
    """Delete ``path``, ignoring the error if it is missing or cannot be removed."""
    try:
        os.remove(path)
    except OSError:
        # Best-effort cleanup of a temp file: a failure here must not mask the
        # real outcome of the request.
        pass


@app.post("/speak")
async def speak(text: str = Form(...)):
    """Synthesize ``text`` with edge-tts, run it through RVC, and return the audio.

    Args:
        text: Form field containing the text to speak.

    Returns:
        A ``FileResponse`` streaming the converted audio as ``output.wav``.

    Raises:
        subprocess.CalledProcessError: If the ``app/rvc_infer.py`` subprocess
            exits with a non-zero status.
    """
    # Local import: fastapi is already a dependency of this module, and keeping
    # the import here leaves the file-level import block untouched.
    from fastapi import BackgroundTasks

    # Reserve both temp paths and close the handles right away, so that edge-tts
    # and the RVC subprocess can open the files themselves by name.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_tts:
        tts_path = tmp_tts.name
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_out:
        out_path = tmp_out.name

    try:
        # Step 1: Generate TTS with edge-tts
        # NOTE(review): the file is named .wav, but edge-tts may write a
        # different container (MP3?) — confirm the downstream loader copes.
        tts = edge_tts.Communicate(text, voice="en-US-AriaNeural")
        await tts.save(tts_path)

        # Step 2: Run RVC conversion
        cmd = [
            "python3",
            "app/rvc_infer.py",
            "--input", tts_path,
            "--output", out_path,
            "--model", MODEL_PATH,
            "--index", INDEX_PATH,
        ]
        # subprocess.run blocks until the child exits; run it in a worker
        # thread so the event loop keeps serving other requests meanwhile.
        await asyncio.to_thread(subprocess.run, cmd, check=True)
    except BaseException:
        # No response will be sent, so the output file is orphaned: drop it,
        # then let the original error (or cancellation) propagate unchanged.
        _remove_quietly(out_path)
        raise
    finally:
        # The intermediate TTS file is never needed past this point.
        _remove_quietly(tts_path)

    # Step 3: Return audio file, deleting it once the response has been sent.
    cleanup = BackgroundTasks()
    cleanup.add_task(_remove_quietly, out_path)
    return FileResponse(
        out_path,
        media_type="audio/wav",
        filename="output.wav",
        background=cleanup,
    )
|
app/rvc_infer.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import torch
|
| 3 |
+
import librosa
|
| 4 |
+
import soundfile as sf
|
| 5 |
+
import numpy as np
|
| 6 |
+
|
| 7 |
+
# Core RVC imports
|
| 8 |
+
from rvc.model_infer import SynthesizerTrn
|
| 9 |
+
from rvc import utils
|
| 10 |
+
from rvc.modules.vc.pipeline import VC
|
| 11 |
+
import faiss
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def main():
    """Convert one audio file with an RVC model, driven by command-line flags.

    Required flags: ``--input`` (audio to convert), ``--output`` (where the
    result is written), ``--model`` and ``--index`` (both handed to ``VC``).
    """
    cli = argparse.ArgumentParser()
    for flag in ("--input", "--output", "--model", "--index"):
        cli.add_argument(flag, required=True)
    opts = cli.parse_args()

    # Read the source audio, asking librosa for a 48 kHz signal.
    waveform, rate = librosa.load(opts.input, sr=48000)

    # Build the converter on the CPU device (original note: HF free tier is
    # CPU only).
    cpu = torch.device("cpu")
    converter = VC(opts.model, opts.index, cpu)

    # Conversion settings, passed through to vc_single as keyword arguments.
    settings = dict(
        sid=0,              # Speaker ID (default: 0)
        input_audio=waveform,
        input_sr=rate,
        f0_up_key=0,        # Pitch shift (0 = none)
        f0_method="crepe",  # Pitch extractor ("pm", "harvest", "crepe")
        index_rate=0.75,    # Weight for index feature
        filter_radius=3,
        resample_sr=0,
        rms_mix_rate=0.25,
        protect=0.33,
    )
    # vc_single's result is unpacked as a pair; only the first item is used.
    converted, _ = converter.vc_single(**settings)

    # Write the converted audio out at 48 kHz.
    sf.write(opts.output, converted, 48000)


if __name__ == "__main__":
    main()
|
models/Chiyu_v2_48k.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:69cafbdd228bcd96736f064fc7943d34fcccf8fd8cdf95ef1941559d1a577dfb
|
| 3 |
+
size 57581999
|
models/added_IVF256_Flat_nprobe_1_Chiyu_v2_48k_v2.index
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8c5d3991205e84fd10517ed16ee1cdc738529845effb5e42813f926db6d842eb
|
| 3 |
+
size 31588619
|
requirements.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi
uvicorn
# Needed by FastAPI to parse form fields (the /speak endpoint uses Form(...)).
python-multipart
edge-tts
librosa
soundfile
torch
torchaudio
faiss-cpu
numpy
|