added technical report
- .gitattributes +1 -0
- README.md +31 -42
- phi_4_mm.tech_report.02252025.pdf +3 -0
.gitattributes
CHANGED
@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 tokenizer.json filter=lfs diff=lfs merge=lfs -text
+*.pdf filter=lfs diff=lfs merge=lfs -text
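The new `*.pdf` rule routes any PDF in the repo (here, the technical report added below) through Git LFS, so the repository stores a small pointer file instead of the binary. As a rough sketch of how these glob patterns select files — note that git applies `.gitattributes` patterns with its own path-matching rules, so `fnmatch` is only an approximation:

```python
from fnmatch import fnmatch

# LFS-tracked glob patterns from the excerpt above, plus the new *.pdf rule
lfs_patterns = ["*.zst", "*tfevents*", "tokenizer.json", "*.pdf"]

def is_lfs_tracked(filename: str) -> bool:
    """Approximate check: does any LFS pattern match this filename?"""
    return any(fnmatch(filename, pattern) for pattern in lfs_patterns)

print(is_lfs_tracked("phi_4_mm.tech_report.02252025.pdf"))  # True, via *.pdf
print(is_lfs_tracked("README.md"))                          # False
```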
README.md
CHANGED
@@ -42,13 +42,13 @@ Polish, Portuguese, Russian, Spanish, Swedish, Thai, Turkish, Ukrainian
 - Vision: English
 - Audio: English, Chinese, German, French, Italian, Japanese, Spanish, Portuguese
 
-💡 [Phi-4-multimodal Portal]() <br>
-📰 [Phi-4-multimodal Microsoft Blog]() <br>
-📖 [Phi-4-multimodal Technical Report]() <br>
-👩‍🍳 [Phi-4-multimodal Cookbook]() <br>
+💡 [Phi-4-multimodal Portal](https://aka.ms/phi-4-multimodal/azure) <br>
+📰 [Phi-4-multimodal Microsoft Blog](https://aka.ms/phi4techblog-feb2025) <br>
+📖 [Phi-4-multimodal Technical Report](https://aka.ms/phi-4-multimodal/techreport) <br>
+👩‍🍳 [Phi-4-multimodal Cookbook](https://github.com/microsoft/PhiCookBook) <br>
 🖥️ [Try It](https://aka.ms/try-phi4mm) <br>
 
-**Phi-4**: [[multimodal-instruct](https://huggingface.co/microsoft/Phi-
+**Phi-4**: [[multimodal-instruct](https://huggingface.co/microsoft/Phi-4-multimodal-instruct) | [onnx](https://huggingface.co/microsoft/Phi-4-multimodal-instruct)]; [[mini-instruct]](https://huggingface.co/microsoft/Phi-4-mini-instruct);
 
 ## Intended Uses
 
@@ -218,10 +218,14 @@ torch==2.6.0
 transformers==4.48.2
 accelerate==1.3.0
 soundfile==0.13.1
-pillow==
+pillow==11.1.0
+scipy==1.15.2
+torchvision==0.21.0
+backoff==2.2.1
+peft==0.13.2
 ```
 
-Phi-4-multimodal-instruct is also available in [Azure AI Studio]()
+Phi-4-multimodal-instruct is also available in [Azure AI Studio](https://aka.ms/phi-4-multimodal/azure)
 
 ### Tokenizer
 
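A quick way to check that an installed environment matches the newly pinned versions, using only the standard library (a sketch, not part of the README):

```python
from importlib.metadata import PackageNotFoundError, version

# The five packages this commit adds to the pinned requirements
pinned = {
    "pillow": "11.1.0",
    "scipy": "1.15.2",
    "torchvision": "0.21.0",
    "backoff": "2.2.1",
    "peft": "0.13.2",
}

for package, expected in pinned.items():
    try:
        installed = version(package)
        status = "ok" if installed == expected else f"mismatch: found {installed}"
    except PackageNotFoundError:
        status = "missing"
    print(f"{package}=={expected} ... {status}")
```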
@@ -324,7 +328,7 @@ If it is a square image, the resolution would be around (8*448 by 8*448). For mu
 
 ### Loading the model locally
 
-After obtaining the Phi-4-
+After obtaining the Phi-4-multimodal-instruct model checkpoints, users can use this sample code for inference.
 
 ```python
 import requests
@@ -334,6 +338,8 @@ import io
 from PIL import Image
 import soundfile as sf
 from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
+from urllib.request import urlopen
+
 
 # Define model path
 model_path = "microsoft/Phi-4-multimodal-instruct"
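The hunks above and below show only the changed lines of the README's sample; the unchanged setup between them defines the `processor`, `model`, `generation_config`, and chat-format markers used by the later fragments. A sketch of that setup — the exact arguments and marker strings are assumptions consistent with the `<|user|>...<|end|><|assistant|>` prompt format visible below:

```python
# Sketch of the sample's unchanged setup (not part of this diff).
# Arguments follow the standard transformers from_pretrained API.
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True,
)
generation_config = GenerationConfig.from_pretrained(model_path)

# Chat-format markers assumed by the prompt f-strings in the audio section
user_prompt = '<|user|>'
assistant_prompt = '<|assistant|>'
prompt_suffix = '<|end|>'
```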
@@ -380,44 +386,27 @@ print(f'>>> Response\n{response}')
 
 # Part 2: Audio Processing
 print("\n--- AUDIO PROCESSING ---")
-audio_url = "https://
+audio_url = "https://upload.wikimedia.org/wikipedia/commons/b/b0/Barbara_Sahakian_BBC_Radio4_The_Life_Scientific_29_May_2012_b01j5j24.flac"
 speech_prompt = "Transcribe the audio to text, and then translate the audio to French. Use <sep> as a separator between the original transcript and the translation."
 prompt = f'{user_prompt}<|audio_1|>{speech_prompt}{prompt_suffix}{assistant_prompt}'
 print(f'>>> Prompt\n{prompt}')
 
-#
-        max_new_tokens=1000,
-        generation_config=generation_config,
-    )
-    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
-    response = processor.batch_decode(
-        generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
-    )[0]
-    print(f'>>> Response\n{response}')
-
-    # Clean up
-    try:
-        os.remove(temp_audio_path)
-        print(f"Temporary file {temp_audio_path} removed successfully")
-    except Exception as e:
-        print(f"Error removing temporary file: {e}")
-else:
-    print(f"Failed to download audio file: {audio_response.status_code}")
+# Download and open audio file
+audio, samplerate = sf.read(io.BytesIO(urlopen(audio_url).read()))
+
+# Process with the model
+inputs = processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to('cuda:0')
+
+generate_ids = model.generate(
+    **inputs,
+    max_new_tokens=1000,
+    generation_config=generation_config,
+)
+generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
+response = processor.batch_decode(
+    generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+)[0]
+print(f'>>> Response\n{response}')
 ```
 
 ## Responsible AI Considerations
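The change above replaces the old download-to-temp-file flow (requests plus `os.remove` cleanup behind a status-code check) with a direct in-memory read via `urlopen` and `io.BytesIO`. The image half of the sample (Part 1) is untouched by the commit and therefore absent from the diff; for orientation, a sketch of how the same generate/decode pattern handles an image input — the URL, question, and `images=` argument here are illustrative assumptions, not lines from the README:

```python
# Part 1 counterpart (sketch, not from this diff): the same pattern for images.
image_url = 'https://www.ilankelman.org/stopsigns/australia.jpg'  # placeholder URL
prompt = f'{user_prompt}<|image_1|>What is shown in this image?{prompt_suffix}{assistant_prompt}'

# Download the image in memory and run it through the processor
image = Image.open(requests.get(image_url, stream=True).raw)
inputs = processor(text=prompt, images=image, return_tensors='pt').to('cuda:0')

generate_ids = model.generate(
    **inputs,
    max_new_tokens=1000,
    generation_config=generation_config,
)
generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
response = processor.batch_decode(
    generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]
print(f'>>> Response\n{response}')
```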
phi_4_mm.tech_report.02252025.pdf
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a5469d9123cbee2b41729db3217cacfeaa96eaf543868caa2eeec7cf2d24547d
+size 5295165
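These three lines are the Git LFS pointer stored in place of the roughly 5.3 MB PDF; the actual bytes live in LFS storage, keyed by the SHA-256 OID. A minimal sketch of reading such a pointer:

```python
def parse_lfs_pointer(text: str) -> dict:
    """Split a Git LFS pointer file into its space-separated key/value fields."""
    return dict(line.split(" ", 1) for line in text.strip().splitlines())

pointer = (
    "version https://git-lfs.github.com/spec/v1\n"
    "oid sha256:a5469d9123cbee2b41729db3217cacfeaa96eaf543868caa2eeec7cf2d24547d\n"
    "size 5295165\n"
)

fields = parse_lfs_pointer(pointer)
print(fields["oid"])        # sha256:a5469d91...
print(int(fields["size"]))  # 5295165 bytes, ~5.3 MB
```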