For those who need a simplified execution on NVIDIA GPU

#21
by ghostplant - opened

Server: One-step booting on a single A100 GPU:

# Install the Hugging Face CLI, then fetch the model weights into local
# directories that the server container will read from.
# NOTE: -U is shorthand for --upgrade, so only one of the two is needed.
pip3 install -U "huggingface_hub[cli]"
hf download Qwen/Qwen2.5-1.5B --local-dir Qwen/Qwen2.5-1.5B
hf download aoi-ot/VibeVoice-1.5B --local-dir aoi-ot/VibeVoice-1.5B

# Launch the inference server.
# --try_path must match the directory the weights were downloaded into above
# (aoi-ot/VibeVoice-1.5B); the original pointed at microsoft/VibeVoice-1.5B,
# which was never downloaded. "/host$(pwd)" is quoted so paths containing
# spaces still work.
docker run \
  -e VOICES="https://homepages.inf.ed.ac.uk/htang2/notes/speech-samples/103-1240-0000.wav" \
  -e LOCAL_SIZE=1 -e WORKER=2 \
  -it --rm --ipc=host --net=host --shm-size=8g \
  --ulimit memlock=-1 --ulimit stack=67108864 \
  -v /:/host -w "/host$(pwd)" -v /tmp:/tmp \
  -v /usr/lib/x86_64-linux-gnu/libcuda.so.1:/usr/lib/x86_64-linux-gnu/libcuda.so.1 \
  --privileged \
  tutelgroup/deepseek-671b:a100x8-chat-20251222 \
  --try_path aoi-ot/VibeVoice-1.5B

Client: Audio generation (Chinese/English)

curl -X POST http://0.0.0.0:8000/chat -d '{"text": "VibeVoice is a novel framework designed for generating expressive, long-form, multi-speaker conversational audio, such as podcasts, from text."}' > sound_output.mp3

Sign up or log in to comment