For those who need simplified execution on an NVIDIA GPU
#21 — opened by ghostplant
Server: one-step boot on a single A100 GPU:
# Install/upgrade the Hugging Face CLI (provides the `hf` command).
# NOTE: -U is the short form of --upgrade; passing both was redundant.
pip3 install -U "huggingface_hub[cli]"

# Pre-download both checkpoints into local directories under the current
# working dir, which is bind-mounted into the container below via -v /:/host.
hf download Qwen/Qwen2.5-1.5B --local-dir Qwen/Qwen2.5-1.5B
hf download aoi-ot/VibeVoice-1.5B --local-dir aoi-ot/VibeVoice-1.5B

# Launch the server. --try_path must match the directory downloaded above
# (was microsoft/VibeVoice-1.5B, which does not exist locally and would
# bypass the pre-downloaded aoi-ot/VibeVoice-1.5B checkpoint).
# $(pwd) is quoted so host paths containing spaces still work.
docker run -e VOICES="https://homepages.inf.ed.ac.uk/htang2/notes/speech-samples/103-1240-0000.wav" \
  -e LOCAL_SIZE=1 -e WORKER=2 -it --rm --ipc=host --net=host --shm-size=8g --ulimit memlock=-1 --ulimit stack=67108864 \
  -v /:/host -w "/host$(pwd)" -v /tmp:/tmp -v /usr/lib/x86_64-linux-gnu/libcuda.so.1:/usr/lib/x86_64-linux-gnu/libcuda.so.1 --privileged \
  tutelgroup/deepseek-671b:a100x8-chat-20251222 --try_path aoi-ot/VibeVoice-1.5B
Client: audio generation (Chinese/English):
# POST the text to the running server and save the returned audio stream.
curl -X POST http://0.0.0.0:8000/chat \
  -d '{"text": "VibeVoice is a novel framework designed for generating expressive, long-form, multi-speaker conversational audio, such as podcasts, from text."}' \
  > sound_output.mp3