Upload 3 files

Browse files

Files changed (3) hide show

genai_config.json +38 -0
speech_processor.json +48 -0
vision_processor.json +68 -0

genai_config.json CHANGED Viewed

@@ -25,6 +25,44 @@
             "num_hidden_layers": 32,
             "num_key_value_heads": 8
         },
         "eos_token_id": [
             200020,
             199999

             "num_hidden_layers": 32,
             "num_key_value_heads": 8
         },
+        "vision": {
+            "filename": "phi-4-mm-vision.onnx",
+            "config_filename": "vision_processor.json",
+            "adapter_filename": "phi-4-mm-vision.onnx_adapter",
+            "inputs": {
+                "pixel_values": "pixel_values",
+                "attention_mask": "image_attention_mask",
+                "image_sizes": "image_sizes"
+            },
+            "outputs": {
+                "image_features": "image_features"
+            }
+        },
+        "speech": {
+            "filename": "phi-4-mm-speech.onnx",
+            "config_filename": "speech_processor.json",
+            "adapter_filename": "phi-4-mm-speech.onnx_adapter",
+            "inputs": {
+                "audio_embeds": "audio_embeds",
+                "attention_mask": "audio_attention_mask",
+                "audio_sizes": "audio_sizes",
+                "audio_projection_mode": "audio_projection_mode"
+            },
+            "outputs": {
+                "audio_features": "audio_features"
+            }
+        },
+        "embedding": {
+            "filename": "phi-4-mm-embedding.onnx",
+            "inputs": {
+                "input_ids": "input_ids",
+                "image_features": "image_features",
+                "audio_features": "audio_features"
+            },
+            "outputs": {
+                "inputs_embeds": "inputs_embeds"
+            }
+        },
         "eos_token_id": [
             200020,
             199999

speech_processor.json ADDED Viewed

	@@ -0,0 +1,48 @@

+{
+    "feature_extraction": {
+        "sequence": [
+            {
+                "operation": {
+                    "name": "audio_decoder",
+                    "type": "AudioDecoderEx",
+                    "attrs": {
+                        "target_sample_rates": [
+                            8000,
+                            16000
+                        ]
+                    }
+                }
+            },
+            {
+                "operation": {
+                    "name": "phi_4_audio_embed",
+                    "type": "Phi4AudioEmbed",
+                    "attrs": {
+                        "audio_compression_rate": 8,
+                        "stft_normal/n_fft": 512,
+                        "stft_normal/frame_length": 400,
+                        "stft_normal/hop_length": 160,
+                        "stft_normal/win_fn": "hamming",
+                        "logmel/chunk_size": 30,
+                        "logmel/hop_length": 160,
+                        "logmel/n_fft": 512,
+                        "logmel/n_mel": 80,
+                        "logmel/feature_first": 0,
+                        "logmel/no_padding": 1,
+                        "stft_normal_8k/n_fft": 256,
+                        "stft_normal_8k/frame_length": 200,
+                        "stft_normal_8k/hop_length": 80,
+                        "stft_normal_8k/win_fn": "hamming",
+                        "logmel_8k/chunk_size": 30,
+                        "logmel_8k/hop_length": 80,
+                        "logmel_8k/n_fft": 512,
+                        "logmel_8k/n_mel": 80,
+                        "logmel_8k/feature_first": 0,
+                        "logmel_8k/no_padding": 1
+                    }
+                }
+            }
+        ],
+        "output_aligner": "phi4-audio-aligner"
+    }
+}

vision_processor.json ADDED Viewed

	@@ -0,0 +1,68 @@

+{
+    "processor": {
+        "name": "phi_4_vision_processor",
+        "transforms": [
+            {
+                "operation": {
+                    "name": "decode_image",
+                    "type": "DecodeImage",
+                    "attrs": {
+                        "color_space": "RGB"
+                    }
+                }
+            },
+            {
+                "operation": {
+                    "name": "phi4_vision_dynamic_preprocess",
+                    "type": "Phi4VisionDynamicPreprocess",
+                    "attrs": {
+                        "dynamic_hd": 36,
+                        "dyhd_base_resolution": 448
+                    }
+                }
+            },
+            {
+                "operation": {
+                    "name": "rescale",
+                    "type": "Rescale",
+                    "inputs": [
+                        ":0"
+                    ]
+                }
+            },
+            {
+                "operation": {
+                    "name": "normalize",
+                    "type": "Normalize",
+                    "attrs": {
+                        "mean": [
+                            0.5,
+                            0.5,
+                            0.5
+                        ],
+                        "std": [
+                            0.5,
+                            0.5,
+                            0.5
+                        ]
+                    }
+                }
+            },
+            {
+                "operation": {
+                    "name": "phi4_vision_processor",
+                    "type": "Phi4VisionProcessor",
+                    "inputs": [
+                        ":0",
+                        "phi4_vision_dynamic_preprocess:1"
+                    ],
+                    "attrs": {
+                        "dyhd_base_resolution": 448,
+                        "interpolation": "CUBIC"
+                    }
+                }
+            }
+        ],
+        "output_aligner": "phi4-vision-aligner"
+    }
+}