Add think-mode PEFT example
README.md
@@ -245,6 +245,59 @@ decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:],
print(decoded_outputs)
```

### Think-mode reasoning with PEFT adapter (AF-Think)

AF-Think is distributed as a PEFT (LoRA) adapter in the `think` subfolder of the model repo. The example below downloads the repo snapshot, loads the accompanying non-LoRA trainable weights into the base model, and then attaches the adapter with `PeftModel.from_pretrained` before generating.

```python
import os

import torch
from huggingface_hub import snapshot_download
from peft import PeftModel
from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor

model_id = "nvidia/audio-flamingo-3-hf"
# Download the full repo locally so the adapter files in the "think" subfolder are on disk.
local_id = snapshot_download(model_id)

processor = AutoProcessor.from_pretrained(local_id)
model = AudioFlamingo3ForConditionalGeneration.from_pretrained(local_id, device_map="auto")

# Load the non-LoRA weights trained alongside the adapter
# (strict=False because this checkpoint covers only a subset of the model).
non_lora_path = os.path.join(local_id, "think", "non_lora_trainables.bin")
non_lora_trainables = torch.load(non_lora_path)
model.load_state_dict(non_lora_trainables, strict=False)

# Wrap the base model with the LoRA adapter from the "think" subfolder.
model = PeftModel.from_pretrained(model, local_id, subfolder="think")

conversation = [
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "Generate a detailed caption for the input audio, describing all notable speech, sound, and musical events comprehensively. In the caption, transcribe all spoken content by all speakers in the audio precisely.\nPlease think and reason about the input music before you respond.",
            },
            {
                "type": "audio",
                "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/videoplayback_superman.wav",
            },
        ],
    }
]

inputs = processor.apply_chat_template(
    conversation,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=1024)

# Decode only the newly generated tokens, skipping the prompt.
decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1] :], skip_special_tokens=True)
print(decoded_outputs)
```
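If you intend to serve many think-mode requests, you can optionally merge the adapter into the base model. This is a minimal sketch using PEFT's standard `merge_and_unload` call (an optional extra step assumed here, not part of the repo's example):

```python
# Optional: fold the LoRA weights into the base model so generate() no longer
# routes through PEFT adapter layers. merge_and_unload() is the standard peft
# API for this; it returns the underlying transformers model with merged weights.
merged_model = model.merge_and_unload()

# Reuse the same processed inputs as above.
outputs = merged_model.generate(**inputs, max_new_tokens=1024)
decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1] :], skip_special_tokens=True)
print(decoded_outputs)
```

Note that merging is one-way: keep the unmerged `PeftModel` if you also want base (non-think) outputs by disabling the adapter.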

### Training / Fine-tuning

```python