Update README.md

Browse files

Files changed (1) hide show

README.md +55 -0

README.md CHANGED Viewed

	@@ -14,3 +14,58 @@ when using the initial version, the decoder ((autoencoder_arm.onnx)) crashes the
14
15	nothing to see here, yet... just wanted a place to store these.
16

 nothing to see here, yet... just wanted a place to store these.
+Like everything else I do... pure vibes, zero real knowledge.
+Here's a Python script I used to validate the ONNX outputs against the original PyTorch model.
+There's another one using CFG that gets essentially the same outputs.
+```
+#!/usr/bin/env python
+import numpy as np, soundfile as sf, onnxruntime as ort
+from transformers import AutoTokenizer
+# Load ONNX models
+dit = ort.InferenceSession("diffusion_dit_arm.onnx")
+cond = ort.InferenceSession("conditioners.onnx")
+dec  = ort.InferenceSession("autoencoder_arm.onnx")
+# Config
+prompt = "lo-fi hip-hop beat with pianos 90bpm"
+steps = 10
+rng = np.random.RandomState(12345)
+x = rng.randn(1, 64, 256).astype(np.float32)
+# Conditioning
+tok = AutoTokenizer.from_pretrained("t5-base")
+tokens = tok(prompt, truncation=True, padding="max_length", max_length=128, return_tensors="np")
+conds = cond.run(None, {
+    "input_ids": tokens["input_ids"].astype(np.int64),
+    "attention_mask": tokens["attention_mask"].astype(np.int64),
+    "seconds_total": np.array([10.0], dtype=np.float32)
+})
+cross, _, glob = conds
+# Run 10 steps with linear t, no CFG
+for i in range(steps):
+    t_val = 1.0 - i / (steps - 1)
+    t = np.array([t_val], dtype=np.float32)
+    v = dit.run(None, {
+        "x": x, "t": t,
+        "cross_attn_cond": cross,
+        "global_cond": glob
+    })[0]
+    x -= 0.1 * v  # fixed Euler step
+# Decode
+audio = dec.run(None, {'sampled': x})[0]
+if audio.shape[0] == 2:
+    audio = audio.T
+audio /= np.abs(audio).max()
+sf.write("onnx_lofi_linear.wav", audio, 44100)
+print("✅ onnx_lofi_linear.wav written!")
+```