Spaces:
Runtime error
Runtime error
Hugo Flores Garcia
committed on
Commit
·
4c6c719
1
Parent(s):
fff28a2
pin numpy
Browse files- README.md +0 -5
- demo.py → app.py +25 -57
- setup.py +1 -0
README.md
CHANGED
|
@@ -41,11 +41,6 @@ Download the pretrained models from [this link](https://zenodo.org/record/813654
|
|
| 41 |
|
| 42 |
# Usage
|
| 43 |
|
| 44 |
-
First, you'll want to set up your environment
|
| 45 |
-
```bash
|
| 46 |
-
source ./env/env.sh
|
| 47 |
-
```
|
| 48 |
-
|
| 49 |
## Launching the Gradio Interface
|
| 50 |
You can launch a gradio UI to play with vampnet.
|
| 51 |
|
|
|
|
| 41 |
|
| 42 |
# Usage
|
| 43 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
## Launching the Gradio Interface
|
| 45 |
You can launch a gradio UI to play with vampnet.
|
| 46 |
|
demo.py → app.py
RENAMED
|
@@ -32,15 +32,6 @@ dataset = at.data.datasets.AudioDataset(
|
|
| 32 |
)
|
| 33 |
|
| 34 |
|
| 35 |
-
checkpoints = {
|
| 36 |
-
"vampnet": {
|
| 37 |
-
"coarse": "./models/vampnet/coarse.pth",
|
| 38 |
-
"c2f": "./models/vampnet/c2f.pth",
|
| 39 |
-
"codec": "./models/vampnet/codec.pth",
|
| 40 |
-
"full_ckpt": True
|
| 41 |
-
},
|
| 42 |
-
}
|
| 43 |
-
interface.checkpoint_key = "vampnet"
|
| 44 |
|
| 45 |
|
| 46 |
OUT_DIR = Path("gradio-outputs")
|
|
@@ -74,23 +65,10 @@ def load_random_audio():
|
|
| 74 |
|
| 75 |
|
| 76 |
def _vamp(data, return_mask=False):
|
| 77 |
-
|
| 78 |
-
# if our checkpoint key is different, we need to load a new checkpoint
|
| 79 |
-
if data[checkpoint_key] != interface.checkpoint_key:
|
| 80 |
-
print(f"loading checkpoint {data[checkpoint_key]}")
|
| 81 |
-
interface.lora_load(
|
| 82 |
-
checkpoints[data[checkpoint_key]]["coarse"],
|
| 83 |
-
checkpoints[data[checkpoint_key]]["c2f"],
|
| 84 |
-
checkpoints[data[checkpoint_key]]["full_ckpt"],
|
| 85 |
-
)
|
| 86 |
-
interface.checkpoint_key = data[checkpoint_key]
|
| 87 |
-
|
| 88 |
out_dir = OUT_DIR / str(uuid.uuid4())
|
| 89 |
out_dir.mkdir()
|
| 90 |
sig = at.AudioSignal(data[input_audio])
|
| 91 |
|
| 92 |
-
# TODO: random pitch shift of segments in the signal to prompt! window size should be a parameter, pitch shift width should be a parameter
|
| 93 |
-
|
| 94 |
z = interface.encode(sig)
|
| 95 |
|
| 96 |
ncc = data[n_conditioning_codebooks]
|
|
@@ -211,10 +189,7 @@ with gr.Blocks() as demo:
|
|
| 211 |
|
| 212 |
with gr.Row():
|
| 213 |
with gr.Column():
|
| 214 |
-
|
| 215 |
-
label="use coarse2fine",
|
| 216 |
-
value=True
|
| 217 |
-
)
|
| 218 |
|
| 219 |
manual_audio_upload = gr.File(
|
| 220 |
label=f"upload some audio (will be randomly trimmed to max of {interface.coarse.chunk_size_s:.2f}s)",
|
|
@@ -250,38 +225,17 @@ with gr.Blocks() as demo:
|
|
| 250 |
# mask settings
|
| 251 |
with gr.Column():
|
| 252 |
|
| 253 |
-
input_pitch_shift = gr.Slider(
|
| 254 |
-
label="input pitch shift (semitones)",
|
| 255 |
-
minimum=-36,
|
| 256 |
-
maximum=36,
|
| 257 |
-
step=1,
|
| 258 |
-
value=0,
|
| 259 |
-
)
|
| 260 |
-
|
| 261 |
-
rand_mask_intensity = gr.Slider(
|
| 262 |
-
label="random mask intensity. (If this is less than 1, scatters prompts throughout the audio, should be between 0.9 and 1.0)",
|
| 263 |
-
minimum=0.0,
|
| 264 |
-
maximum=1.0,
|
| 265 |
-
value=1.0
|
| 266 |
-
)
|
| 267 |
-
|
| 268 |
periodic_p = gr.Slider(
|
| 269 |
-
label="periodic prompt (0.0 means no
|
| 270 |
minimum=0,
|
| 271 |
maximum=128,
|
| 272 |
step=1,
|
| 273 |
value=3,
|
| 274 |
)
|
| 275 |
-
|
| 276 |
-
label="periodic prompt width (steps, 1 step ~= 10milliseconds)",
|
| 277 |
-
minimum=1,
|
| 278 |
-
maximum=20,
|
| 279 |
-
step=1,
|
| 280 |
-
value=1,
|
| 281 |
-
)
|
| 282 |
|
| 283 |
onset_mask_width = gr.Slider(
|
| 284 |
-
label="onset mask width (
|
| 285 |
minimum=0,
|
| 286 |
maximum=20,
|
| 287 |
step=1,
|
|
@@ -301,6 +255,20 @@ with gr.Blocks() as demo:
|
|
| 301 |
|
| 302 |
|
| 303 |
with gr.Accordion("extras ", open=False):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 304 |
n_conditioning_codebooks = gr.Number(
|
| 305 |
label="number of conditioning codebooks. probably 0",
|
| 306 |
value=0,
|
|
@@ -337,6 +305,8 @@ with gr.Blocks() as demo:
|
|
| 337 |
value=0.8
|
| 338 |
)
|
| 339 |
|
|
|
|
|
|
|
| 340 |
with gr.Accordion("sampling settings", open=False):
|
| 341 |
typical_filtering = gr.Checkbox(
|
| 342 |
label="typical filtering ",
|
|
@@ -356,6 +326,11 @@ with gr.Blocks() as demo:
|
|
| 356 |
value=64
|
| 357 |
)
|
| 358 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 359 |
num_steps = gr.Slider(
|
| 360 |
label="number of steps (should normally be between 12 and 36)",
|
| 361 |
minimum=1,
|
|
@@ -375,11 +350,6 @@ with gr.Blocks() as demo:
|
|
| 375 |
|
| 376 |
# mask settings
|
| 377 |
with gr.Column():
|
| 378 |
-
checkpoint_key = gr.Radio(
|
| 379 |
-
label="checkpoint",
|
| 380 |
-
choices=list(checkpoints.keys()),
|
| 381 |
-
value="spotdl"
|
| 382 |
-
)
|
| 383 |
vamp_button = gr.Button("vamp!!!")
|
| 384 |
output_audio = gr.Audio(
|
| 385 |
label="output audio",
|
|
@@ -414,11 +384,9 @@ with gr.Blocks() as demo:
|
|
| 414 |
use_coarse2fine,
|
| 415 |
stretch_factor,
|
| 416 |
onset_mask_width,
|
| 417 |
-
input_pitch_shift,
|
| 418 |
typical_filtering,
|
| 419 |
typical_mass,
|
| 420 |
typical_min_tokens,
|
| 421 |
-
checkpoint_key,
|
| 422 |
beat_mask_width,
|
| 423 |
beat_mask_downbeats
|
| 424 |
}
|
|
|
|
| 32 |
)
|
| 33 |
|
| 34 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
|
| 36 |
|
| 37 |
OUT_DIR = Path("gradio-outputs")
|
|
|
|
| 65 |
|
| 66 |
|
| 67 |
def _vamp(data, return_mask=False):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
out_dir = OUT_DIR / str(uuid.uuid4())
|
| 69 |
out_dir.mkdir()
|
| 70 |
sig = at.AudioSignal(data[input_audio])
|
| 71 |
|
|
|
|
|
|
|
| 72 |
z = interface.encode(sig)
|
| 73 |
|
| 74 |
ncc = data[n_conditioning_codebooks]
|
|
|
|
| 189 |
|
| 190 |
with gr.Row():
|
| 191 |
with gr.Column():
|
| 192 |
+
|
|
|
|
|
|
|
|
|
|
| 193 |
|
| 194 |
manual_audio_upload = gr.File(
|
| 195 |
label=f"upload some audio (will be randomly trimmed to max of {interface.coarse.chunk_size_s:.2f}s)",
|
|
|
|
| 225 |
# mask settings
|
| 226 |
with gr.Column():
|
| 227 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 228 |
periodic_p = gr.Slider(
|
| 229 |
+
label="periodic prompt (0.0 means no prompt, 2 - lots of hints, 8 - a couple of hints, 16 - occasional hint, 32 - very occasional hint, etc)",
|
| 230 |
minimum=0,
|
| 231 |
maximum=128,
|
| 232 |
step=1,
|
| 233 |
value=3,
|
| 234 |
)
|
| 235 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 236 |
|
| 237 |
onset_mask_width = gr.Slider(
|
| 238 |
+
label="onset mask width (multiplies with the periodic mask, 1 step ~= 10milliseconds) ",
|
| 239 |
minimum=0,
|
| 240 |
maximum=20,
|
| 241 |
step=1,
|
|
|
|
| 255 |
|
| 256 |
|
| 257 |
with gr.Accordion("extras ", open=False):
|
| 258 |
+
rand_mask_intensity = gr.Slider(
|
| 259 |
+
label="random mask intensity. (If this is less than 1, scatters prompts throughout the audio, should be between 0.9 and 1.0)",
|
| 260 |
+
minimum=0.0,
|
| 261 |
+
maximum=1.0,
|
| 262 |
+
value=1.0
|
| 263 |
+
)
|
| 264 |
+
|
| 265 |
+
periodic_w = gr.Slider(
|
| 266 |
+
label="periodic prompt width (steps, 1 step ~= 10milliseconds)",
|
| 267 |
+
minimum=1,
|
| 268 |
+
maximum=20,
|
| 269 |
+
step=1,
|
| 270 |
+
value=1,
|
| 271 |
+
)
|
| 272 |
n_conditioning_codebooks = gr.Number(
|
| 273 |
label="number of conditioning codebooks. probably 0",
|
| 274 |
value=0,
|
|
|
|
| 305 |
value=0.8
|
| 306 |
)
|
| 307 |
|
| 308 |
+
|
| 309 |
+
|
| 310 |
with gr.Accordion("sampling settings", open=False):
|
| 311 |
typical_filtering = gr.Checkbox(
|
| 312 |
label="typical filtering ",
|
|
|
|
| 326 |
value=64
|
| 327 |
)
|
| 328 |
|
| 329 |
+
use_coarse2fine = gr.Checkbox(
|
| 330 |
+
label="use coarse2fine",
|
| 331 |
+
value=True
|
| 332 |
+
)
|
| 333 |
+
|
| 334 |
num_steps = gr.Slider(
|
| 335 |
label="number of steps (should normally be between 12 and 36)",
|
| 336 |
minimum=1,
|
|
|
|
| 350 |
|
| 351 |
# mask settings
|
| 352 |
with gr.Column():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 353 |
vamp_button = gr.Button("vamp!!!")
|
| 354 |
output_audio = gr.Audio(
|
| 355 |
label="output audio",
|
|
|
|
| 384 |
use_coarse2fine,
|
| 385 |
stretch_factor,
|
| 386 |
onset_mask_width,
|
|
|
|
| 387 |
typical_filtering,
|
| 388 |
typical_mass,
|
| 389 |
typical_min_tokens,
|
|
|
|
| 390 |
beat_mask_width,
|
| 391 |
beat_mask_downbeats
|
| 392 |
}
|
setup.py
CHANGED
|
@@ -28,6 +28,7 @@ setup(
|
|
| 28 |
install_requires=[
|
| 29 |
"torch",
|
| 30 |
"argbind>=0.3.2",
|
|
|
|
| 31 |
# "audiotools @ git+https://github.com/descriptinc/audiotools.git@f35914b5b3c6f1bf589cd09481478d741538828e",
|
| 32 |
# "dac @ git+https://github.com/descriptinc/descript-audio-codec.git",
|
| 33 |
"gradio",
|
|
|
|
| 28 |
install_requires=[
|
| 29 |
"torch",
|
| 30 |
"argbind>=0.3.2",
|
| 31 |
+
"numpy==1.22",
|
| 32 |
# "audiotools @ git+https://github.com/descriptinc/audiotools.git@f35914b5b3c6f1bf589cd09481478d741538828e",
|
| 33 |
# "dac @ git+https://github.com/descriptinc/descript-audio-codec.git",
|
| 34 |
"gradio",
|