lokinfey committed on
Commit
53ae8ab
·
verified ·
1 Parent(s): 36f0aee

Upload 3 files

Browse files
Files changed (3) hide show
  1. genai_config.json +38 -0
  2. speech_processor.json +48 -0
  3. vision_processor.json +68 -0
genai_config.json CHANGED
@@ -25,6 +25,44 @@
25
  "num_hidden_layers": 32,
26
  "num_key_value_heads": 8
27
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  "eos_token_id": [
29
  200020,
30
  199999
 
25
  "num_hidden_layers": 32,
26
  "num_key_value_heads": 8
27
  },
28
+ "vision": {
29
+ "filename": "phi-4-mm-vision.onnx",
30
+ "config_filename": "vision_processor.json",
31
+ "adapter_filename": "phi-4-mm-vision.onnx_adapter",
32
+ "inputs": {
33
+ "pixel_values": "pixel_values",
34
+ "attention_mask": "image_attention_mask",
35
+ "image_sizes": "image_sizes"
36
+ },
37
+ "outputs": {
38
+ "image_features": "image_features"
39
+ }
40
+ },
41
+ "speech": {
42
+ "filename": "phi-4-mm-speech.onnx",
43
+ "config_filename": "speech_processor.json",
44
+ "adapter_filename": "phi-4-mm-speech.onnx_adapter",
45
+ "inputs": {
46
+ "audio_embeds": "audio_embeds",
47
+ "attention_mask": "audio_attention_mask",
48
+ "audio_sizes": "audio_sizes",
49
+ "audio_projection_mode": "audio_projection_mode"
50
+ },
51
+ "outputs": {
52
+ "audio_features": "audio_features"
53
+ }
54
+ },
55
+ "embedding": {
56
+ "filename": "phi-4-mm-embedding.onnx",
57
+ "inputs": {
58
+ "input_ids": "input_ids",
59
+ "image_features": "image_features",
60
+ "audio_features": "audio_features"
61
+ },
62
+ "outputs": {
63
+ "inputs_embeds": "inputs_embeds"
64
+ }
65
+ },
66
  "eos_token_id": [
67
  200020,
68
  199999
speech_processor.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "feature_extraction": {
3
+ "sequence": [
4
+ {
5
+ "operation": {
6
+ "name": "audio_decoder",
7
+ "type": "AudioDecoderEx",
8
+ "attrs": {
9
+ "target_sample_rates": [
10
+ 8000,
11
+ 16000
12
+ ]
13
+ }
14
+ }
15
+ },
16
+ {
17
+ "operation": {
18
+ "name": "phi_4_audio_embed",
19
+ "type": "Phi4AudioEmbed",
20
+ "attrs": {
21
+ "audio_compression_rate": 8,
22
+ "stft_normal/n_fft": 512,
23
+ "stft_normal/frame_length": 400,
24
+ "stft_normal/hop_length": 160,
25
+ "stft_normal/win_fn": "hamming",
26
+ "logmel/chunk_size": 30,
27
+ "logmel/hop_length": 160,
28
+ "logmel/n_fft": 512,
29
+ "logmel/n_mel": 80,
30
+ "logmel/feature_first": 0,
31
+ "logmel/no_padding": 1,
32
+ "stft_normal_8k/n_fft": 256,
33
+ "stft_normal_8k/frame_length": 200,
34
+ "stft_normal_8k/hop_length": 80,
35
+ "stft_normal_8k/win_fn": "hamming",
36
+ "logmel_8k/chunk_size": 30,
37
+ "logmel_8k/hop_length": 80,
38
+ "logmel_8k/n_fft": 512,
39
+ "logmel_8k/n_mel": 80,
40
+ "logmel_8k/feature_first": 0,
41
+ "logmel_8k/no_padding": 1
42
+ }
43
+ }
44
+ }
45
+ ],
46
+ "output_aligner": "phi4-audio-aligner"
47
+ }
48
+ }
vision_processor.json ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "processor": {
3
+ "name": "phi_4_vision_processor",
4
+ "transforms": [
5
+ {
6
+ "operation": {
7
+ "name": "decode_image",
8
+ "type": "DecodeImage",
9
+ "attrs": {
10
+ "color_space": "RGB"
11
+ }
12
+ }
13
+ },
14
+ {
15
+ "operation": {
16
+ "name": "phi4_vision_dynamic_preprocess",
17
+ "type": "Phi4VisionDynamicPreprocess",
18
+ "attrs": {
19
+ "dynamic_hd": 36,
20
+ "dyhd_base_resolution": 448
21
+ }
22
+ }
23
+ },
24
+ {
25
+ "operation": {
26
+ "name": "rescale",
27
+ "type": "Rescale",
28
+ "inputs": [
29
+ ":0"
30
+ ]
31
+ }
32
+ },
33
+ {
34
+ "operation": {
35
+ "name": "normalize",
36
+ "type": "Normalize",
37
+ "attrs": {
38
+ "mean": [
39
+ 0.5,
40
+ 0.5,
41
+ 0.5
42
+ ],
43
+ "std": [
44
+ 0.5,
45
+ 0.5,
46
+ 0.5
47
+ ]
48
+ }
49
+ }
50
+ },
51
+ {
52
+ "operation": {
53
+ "name": "phi4_vision_processor",
54
+ "type": "Phi4VisionProcessor",
55
+ "inputs": [
56
+ ":0",
57
+ "phi4_vision_dynamic_preprocess:1"
58
+ ],
59
+ "attrs": {
60
+ "dyhd_base_resolution": 448,
61
+ "interpolation": "CUBIC"
62
+ }
63
+ }
64
+ }
65
+ ],
66
+ "output_aligner": "phi4-vision-aligner"
67
+ }
68
+ }