Add think-mode PEFT example
README.md
@@ -245,6 +245,59 @@ decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:],
print(decoded_outputs)
```

### Think-mode reasoning with PEFT adapter (AF-Think)

AF-Think is distributed as a PEFT (LoRA) adapter in the `think` subfolder of the model repo. The example below downloads the repo snapshot, loads the accompanying non-LoRA trainable weights into the base model, and then attaches the adapter with `PeftModel.from_pretrained` before generating.

```python
import os

import torch
from huggingface_hub import snapshot_download
from peft import PeftModel
from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor

model_id = "nvidia/audio-flamingo-3-hf"
# Download the full repo locally so the adapter files in the "think" subfolder are on disk.
local_id = snapshot_download(model_id)

processor = AutoProcessor.from_pretrained(local_id)
model = AudioFlamingo3ForConditionalGeneration.from_pretrained(local_id, device_map="auto")

# Load the non-LoRA weights trained alongside the adapter
# (strict=False because this checkpoint covers only a subset of the model).
non_lora_path = os.path.join(local_id, "think", "non_lora_trainables.bin")
non_lora_trainables = torch.load(non_lora_path)
model.load_state_dict(non_lora_trainables, strict=False)

# Wrap the base model with the LoRA adapter from the "think" subfolder.
model = PeftModel.from_pretrained(model, local_id, subfolder="think")

conversation = [
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "Generate a detailed caption for the input audio, describing all notable speech, sound, and musical events comprehensively. In the caption, transcribe all spoken content by all speakers in the audio precisely.\nPlease think and reason about the input music before you respond.",
            },
            {
                "type": "audio",
                "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/videoplayback_superman.wav",
            },
        ],
    }
]

inputs = processor.apply_chat_template(
    conversation,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=1024)

# Decode only the newly generated tokens, skipping the prompt.
decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1] :], skip_special_tokens=True)
print(decoded_outputs)
```
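If you intend to serve many think-mode requests, you can optionally merge the adapter into the base model. This is a minimal sketch using PEFT's standard `merge_and_unload` call (an optional extra step assumed here, not part of the repo's example):

```python
# Optional: fold the LoRA weights into the base model so generate() no longer
# routes through PEFT adapter layers. merge_and_unload() is the standard peft
# API for this; it returns the underlying transformers model with merged weights.
merged_model = model.merge_and_unload()

# Reuse the same processed inputs as above.
outputs = merged_model.generate(**inputs, max_new_tokens=1024)
decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1] :], skip_special_tokens=True)
print(decoded_outputs)
```

Note that merging is one-way: keep the unmerged `PeftModel` if you also want base (non-think) outputs by disabling the adapter.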

### Training / Fine-tuning

```python