SreyanG-NVIDIA committed (verified) · Commit b1929d6 · 1 Parent(s): b261166

Add think-mode peft example

Files changed (1): README.md (+53 −0)

README.md CHANGED
@@ -245,6 +245,59 @@ decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:],
  print(decoded_outputs)
  ```

+ ### Think-mode reasoning with PEFT adapter (AF-Think)
+
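+ The `think` subfolder of the model repository ships a PEFT adapter together with extra non-LoRA trainable weights (`non_lora_trainables.bin`); the snippet below loads both on top of the base model to enable AF-Think's reasoning mode.
+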
+ ```python
+ import os
+
+ import torch
+ from huggingface_hub import snapshot_download
+ from peft import PeftModel
+
+ from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor
+
+
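+ # Download the full repo snapshot (base weights plus the "think" adapter folder).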
+ model_id = "nvidia/audio-flamingo-3-hf"
+ local_id = snapshot_download(model_id)
+
+ processor = AutoProcessor.from_pretrained(local_id)
+ model = AudioFlamingo3ForConditionalGeneration.from_pretrained(local_id, device_map="auto")
+
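+ # Load the extra non-LoRA weights that accompany the adapter into the base
+ # model; strict=False leaves every other parameter untouched.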
+ non_lora_path = os.path.join(local_id, "think", "non_lora_trainables.bin")
+ non_lora_trainables = torch.load(non_lora_path)
+ model.load_state_dict(non_lora_trainables, strict=False)
+
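+ # Wrap the base model with the PEFT adapter stored in the "think" subfolder.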
+ model = PeftModel.from_pretrained(model, local_id, subfolder="think")
+
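+ # The closing sentence of the prompt asks the model to reason ("think")
+ # before answering, which is the mode the AF-Think adapter targets.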
+ conversation = [
+     {
+         "role": "user",
+         "content": [
+             {
+                 "type": "text",
+                 "text": "Generate a detailed caption for the input audio, describing all notable speech, sound, and musical events comprehensively. In the caption, transcribe all spoken content by all speakers in the audio precisely.\nPlease think and reason about the input music before you respond.",
+             },
+             {
+                 "type": "audio",
+                 "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/videoplayback_superman.wav",
+             },
+         ],
+     }
+ ]
+
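+ # apply_chat_template tokenizes the text and fetches/featurizes the audio in
+ # a single call, returning a ready-to-use batch of model inputs.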
+ inputs = processor.apply_chat_template(
+     conversation,
+     tokenize=True,
+     add_generation_prompt=True,
+     return_dict=True,
+ ).to(model.device)
+
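+ # Allow plenty of new tokens; think-mode outputs include the reasoning trace.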
+ outputs = model.generate(**inputs, max_new_tokens=1024)
+
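+ # Drop the prompt tokens so only the newly generated text is decoded.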
+ decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1] :], skip_special_tokens=True)
+ print(decoded_outputs)
+ ```
+
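+ Optionally, if you want to serve the adapted model without keeping `peft` in
+ the loop at inference time, you can fold the adapter into the base weights.
+ This is a minimal sketch, assuming the "think" adapter is LoRA-based so that
+ peft's standard `merge_and_unload()` applies (the output directory name is
+ hypothetical):
+
+ ```python
+ # Merge the LoRA weights into the base model and drop the PEFT wrappers.
+ # Assumes a LoRA-type adapter; other adapter types may not support merging.
+ model = model.merge_and_unload()
+ model.save_pretrained("audio-flamingo-3-think-merged")  # hypothetical path
+ ```
+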
  ### Training / Fine-tuning

  ```python