Upload 3 files
Browse files- genai_config.json +38 -0
- speech_processor.json +48 -0
- vision_processor.json +68 -0
genai_config.json
CHANGED
|
@@ -25,6 +25,44 @@
|
|
| 25 |
"num_hidden_layers": 32,
|
| 26 |
"num_key_value_heads": 8
|
| 27 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
"eos_token_id": [
|
| 29 |
200020,
|
| 30 |
199999
|
|
|
|
| 25 |
"num_hidden_layers": 32,
|
| 26 |
"num_key_value_heads": 8
|
| 27 |
},
|
| 28 |
+
"vision": {
|
| 29 |
+
"filename": "phi-4-mm-vision.onnx",
|
| 30 |
+
"config_filename": "vision_processor.json",
|
| 31 |
+
"adapter_filename": "phi-4-mm-vision.onnx_adapter",
|
| 32 |
+
"inputs": {
|
| 33 |
+
"pixel_values": "pixel_values",
|
| 34 |
+
"attention_mask": "image_attention_mask",
|
| 35 |
+
"image_sizes": "image_sizes"
|
| 36 |
+
},
|
| 37 |
+
"outputs": {
|
| 38 |
+
"image_features": "image_features"
|
| 39 |
+
}
|
| 40 |
+
},
|
| 41 |
+
"speech": {
|
| 42 |
+
"filename": "phi-4-mm-speech.onnx",
|
| 43 |
+
"config_filename": "speech_processor.json",
|
| 44 |
+
"adapter_filename": "phi-4-mm-speech.onnx_adapter",
|
| 45 |
+
"inputs": {
|
| 46 |
+
"audio_embeds": "audio_embeds",
|
| 47 |
+
"attention_mask": "audio_attention_mask",
|
| 48 |
+
"audio_sizes": "audio_sizes",
|
| 49 |
+
"audio_projection_mode": "audio_projection_mode"
|
| 50 |
+
},
|
| 51 |
+
"outputs": {
|
| 52 |
+
"audio_features": "audio_features"
|
| 53 |
+
}
|
| 54 |
+
},
|
| 55 |
+
"embedding": {
|
| 56 |
+
"filename": "phi-4-mm-embedding.onnx",
|
| 57 |
+
"inputs": {
|
| 58 |
+
"input_ids": "input_ids",
|
| 59 |
+
"image_features": "image_features",
|
| 60 |
+
"audio_features": "audio_features"
|
| 61 |
+
},
|
| 62 |
+
"outputs": {
|
| 63 |
+
"inputs_embeds": "inputs_embeds"
|
| 64 |
+
}
|
| 65 |
+
},
|
| 66 |
"eos_token_id": [
|
| 67 |
200020,
|
| 68 |
199999
|
speech_processor.json
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"feature_extraction": {
|
| 3 |
+
"sequence": [
|
| 4 |
+
{
|
| 5 |
+
"operation": {
|
| 6 |
+
"name": "audio_decoder",
|
| 7 |
+
"type": "AudioDecoderEx",
|
| 8 |
+
"attrs": {
|
| 9 |
+
"target_sample_rates": [
|
| 10 |
+
8000,
|
| 11 |
+
16000
|
| 12 |
+
]
|
| 13 |
+
}
|
| 14 |
+
}
|
| 15 |
+
},
|
| 16 |
+
{
|
| 17 |
+
"operation": {
|
| 18 |
+
"name": "phi_4_audio_embed",
|
| 19 |
+
"type": "Phi4AudioEmbed",
|
| 20 |
+
"attrs": {
|
| 21 |
+
"audio_compression_rate": 8,
|
| 22 |
+
"stft_normal/n_fft": 512,
|
| 23 |
+
"stft_normal/frame_length": 400,
|
| 24 |
+
"stft_normal/hop_length": 160,
|
| 25 |
+
"stft_normal/win_fn": "hamming",
|
| 26 |
+
"logmel/chunk_size": 30,
|
| 27 |
+
"logmel/hop_length": 160,
|
| 28 |
+
"logmel/n_fft": 512,
|
| 29 |
+
"logmel/n_mel": 80,
|
| 30 |
+
"logmel/feature_first": 0,
|
| 31 |
+
"logmel/no_padding": 1,
|
| 32 |
+
"stft_normal_8k/n_fft": 256,
|
| 33 |
+
"stft_normal_8k/frame_length": 200,
|
| 34 |
+
"stft_normal_8k/hop_length": 80,
|
| 35 |
+
"stft_normal_8k/win_fn": "hamming",
|
| 36 |
+
"logmel_8k/chunk_size": 30,
|
| 37 |
+
"logmel_8k/hop_length": 80,
|
| 38 |
+
"logmel_8k/n_fft": 512,
|
| 39 |
+
"logmel_8k/n_mel": 80,
|
| 40 |
+
"logmel_8k/feature_first": 0,
|
| 41 |
+
"logmel_8k/no_padding": 1
|
| 42 |
+
}
|
| 43 |
+
}
|
| 44 |
+
}
|
| 45 |
+
],
|
| 46 |
+
"output_aligner": "phi4-audio-aligner"
|
| 47 |
+
}
|
| 48 |
+
}
|
vision_processor.json
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"processor": {
|
| 3 |
+
"name": "phi_4_vision_processor",
|
| 4 |
+
"transforms": [
|
| 5 |
+
{
|
| 6 |
+
"operation": {
|
| 7 |
+
"name": "decode_image",
|
| 8 |
+
"type": "DecodeImage",
|
| 9 |
+
"attrs": {
|
| 10 |
+
"color_space": "RGB"
|
| 11 |
+
}
|
| 12 |
+
}
|
| 13 |
+
},
|
| 14 |
+
{
|
| 15 |
+
"operation": {
|
| 16 |
+
"name": "phi4_vision_dynamic_preprocess",
|
| 17 |
+
"type": "Phi4VisionDynamicPreprocess",
|
| 18 |
+
"attrs": {
|
| 19 |
+
"dynamic_hd": 36,
|
| 20 |
+
"dyhd_base_resolution": 448
|
| 21 |
+
}
|
| 22 |
+
}
|
| 23 |
+
},
|
| 24 |
+
{
|
| 25 |
+
"operation": {
|
| 26 |
+
"name": "rescale",
|
| 27 |
+
"type": "Rescale",
|
| 28 |
+
"inputs": [
|
| 29 |
+
":0"
|
| 30 |
+
]
|
| 31 |
+
}
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"operation": {
|
| 35 |
+
"name": "normalize",
|
| 36 |
+
"type": "Normalize",
|
| 37 |
+
"attrs": {
|
| 38 |
+
"mean": [
|
| 39 |
+
0.5,
|
| 40 |
+
0.5,
|
| 41 |
+
0.5
|
| 42 |
+
],
|
| 43 |
+
"std": [
|
| 44 |
+
0.5,
|
| 45 |
+
0.5,
|
| 46 |
+
0.5
|
| 47 |
+
]
|
| 48 |
+
}
|
| 49 |
+
}
|
| 50 |
+
},
|
| 51 |
+
{
|
| 52 |
+
"operation": {
|
| 53 |
+
"name": "phi4_vision_processor",
|
| 54 |
+
"type": "Phi4VisionProcessor",
|
| 55 |
+
"inputs": [
|
| 56 |
+
":0",
|
| 57 |
+
"phi4_vision_dynamic_preprocess:1"
|
| 58 |
+
],
|
| 59 |
+
"attrs": {
|
| 60 |
+
"dyhd_base_resolution": 448,
|
| 61 |
+
"interpolation": "CUBIC"
|
| 62 |
+
}
|
| 63 |
+
}
|
| 64 |
+
}
|
| 65 |
+
],
|
| 66 |
+
"output_aligner": "phi4-vision-aligner"
|
| 67 |
+
}
|
| 68 |
+
}
|