| { | |
| "_name_or_path": "_", | |
| "architectures": [ | |
| "OmniForCausalLM" | |
| ], | |
| "attention_qkv_bias": true, | |
| "attention_qkv_pack": true, | |
| "audio_config": { | |
| "audio_head_transformer_layers": 3, | |
| "audio_delim_token_id": 151693, | |
| "audio_end_token_id": 151677, | |
| "audio_pad_token_id": 151678, | |
| "audio_start_token_id": 151676, | |
| "audiogen_end_token_id": 151701, | |
| "audiogen_start_token_id": 151700, | |
| "audiotext_end_token_id": 151698, | |
| "audiotext_pad_token_id": 151699, | |
| "audiotext_start_token_id": 151697, | |
| "avg_pooler": 4, | |
| "d_model": 1280, | |
| "decoder_attention_heads": 20, | |
| "decoder_ffn_dim": 5120, | |
| "decoder_kernel_size": 3, | |
| "decoder_layers": 8, | |
| "decoder_stride_size": 2, | |
| "enable": true, | |
| "encoder_attention_heads": 20, | |
| "encoder_ffn_dim": 5120, | |
| "encoder_layers": 32, | |
| "hop_length": 160, | |
| "kernel_size": 3, | |
| "max_audio_seconds": 30, | |
| "n_fft": 400, | |
| "num_mel_bins": 128, | |
| "sampling_rate": 16000, | |
| "stride_size": 2, | |
| "split_overlap": 0.0, | |
| "vq_config":{ | |
| "enable": true, | |
| "codebook_sizes": [8192, 4096, 2048, 1024, 1024, 1024, 1024, 1024] | |
| } | |
| }, | |
| "auto_map": { | |
| "AutoConfig": "configuration_omni.OmniConfig", | |
| "AutoModelForCausalLM": "modeling_omni.OmniForCausalLM" | |
| }, | |
| "omni_tokenizer_type": "auto", | |
| "bos_token_id": 1, | |
| "eos_token_id": 2, | |
| "flow_matching_config": { | |
| "enable": true, | |
| "use_hires_mel": true, | |
| "sampling_rate": 24000, | |
| "hop_length": 480, | |
| "max_audio_seconds": 30, | |
| "split_overlap": 0.1, | |
| "use_hidden_states_before_dconv2": true, | |
| "prenet_in_dim": 1280, | |
| "prenet_out_dim": 80, | |
| "prenet_d_model": 512, | |
| "prenet_attention_heads": 8, | |
| "prenet_ffn_dim": 2048, | |
| "prenet_nlayers": 12, | |
| "prenet_activation_function": "gelu", | |
| "prenet_max_source_positions": 5000, | |
| "prenet_target_mel_length_scale_ratio": 1.0, | |
| "prenet_loss_weight": 1.0, | |
| "unet_use_omni_attn": false, | |
| "loss_weight": 1.0, | |
| "in_channels": 80, | |
| "spk_emb_dim": 0, | |
| "diffusion_steps": 10, | |
| "channels": [256], | |
| "dropout": 0.0, | |
| "attention_head_dim": 64, | |
| "n_blocks": 4, | |
| "num_mid_blocks": 12, | |
| "num_heads": 8, | |
| "act_fn": "gelu", | |
| "cal_mel_mae": true, | |
| "cfm_params": { | |
| "sigma_min": 1e-6, | |
| "solver": "euler", | |
| "t_scheduler": "cosine", | |
| "training_cfg_rate": 0.2, | |
| "inference_cfg_rate": 0.7, | |
| "reg_loss_type": "l1" | |
| } | |
| }, | |
| "head_dim": 128, | |
| "hidden_act": "silu", | |
| "hidden_size": 3584, | |
| "initializer_range": 0.02, | |
| "intermediate_size": 18944, | |
| "max_position_embeddings": 65536, | |
| "max_window_layers": 28, | |
| "model_type": "omni", | |
| "multimodal": [ | |
| "audio", | |
| "image", | |
| "video", | |
| "audiogen" | |
| ], | |
| "multimodal_special_token_list": [ | |
| 151676, | |
| 151677, | |
| 151678, | |
| 151679, | |
| 151680, | |
| 151681, | |
| 151682, | |
| 151683, | |
| 151684, | |
| 151685, | |
| 151686, | |
| 151687, | |
| 151688, | |
| 151693, | |
| 151694, | |
| 151695, | |
| 151696, | |
| 151697, | |
| 151698, | |
| 151699, | |
| 151700, | |
| 151701 | |
| ], | |
| "num_attention_heads": 28, | |
| "num_hidden_layers": 28, | |
| "num_key_value_heads": 4, | |
| "pad_token_id": 0, | |
| "position_embedding_type": "rope", | |
| "rms_norm_eps": 1e-06, | |
| "rope_theta": 1000000.0, | |
| "sliding_window": 131072, | |
| "sparse_attention_heads": null, | |
| "sparse_attention_layers": [], | |
| "tie_word_embeddings": false, | |
| "torch_dtype": "bfloat16", | |
| "train_multimodal_special_tokens_only": false, | |
| "transformers_version": "4.45.0.dev0", | |
| "use_cache": false, | |
| "use_norm_head": false, | |
| "use_sliding_window": false, | |
| "video_config": { | |
| "_name_or_path": "", | |
| "_attn_implementation": "flash_attention_2", | |
| "decode_way": "1fps", | |
| "depth": 32, | |
| "embed_dim": 1280, | |
| "enable": true, | |
| "hidden_act": "quick_gelu", | |
| "hidden_size": 3584, | |
| "image_delimiter_token_id": 151688, | |
| "image_end_token_id": 151680, | |
| "image_line_token_id": 151682, | |
| "image_mean": [ | |
| 0.48145466, | |
| 0.4578275, | |
| 0.40821073 | |
| ], | |
| "image_pad_token_id": 151681, | |
| "image_size": 224, | |
| "image_start_token_id": 151679, | |
| "image_std": [ | |
| 0.26862954, | |
| 0.26130258, | |
| 0.27577711 | |
| ], | |
| "in_channels": 3, | |
| "in_chans": 3, | |
| "intermediate_size": 3072, | |
| "layer_norm_eps": 1e-05, | |
| "max_frame_num": 32, | |
| "max_length": 20, | |
| "max_pixels": 602112, | |
| "merge_size": 2, | |
| "min_length": 0, | |
| "min_pixels": 3136, | |
| "mlp_ratio": 4, | |
| "model_type": "clip_vision_model", | |
| "num_attention_heads": 12, | |
| "num_channels": 3, | |
| "num_heads": 16, | |
| "num_hidden_layers": 12, | |
| "patch_size": 14, | |
| "spatial_merge_size": 2, | |
| "spatial_patch_size": 14, | |
| "temporal_patch_size": 2, | |
| "video_end_token_id": 151696, | |
| "video_place_token_id": 151694, | |
| "video_start_token_id": 151695 | |
| }, | |
| "visual_config": { | |
| "_name_or_path": "", | |
| "_attn_implementation": "flash_attention_2", | |
| "depth": 32, | |
| "diversity_penalty": 0.0, | |
| "do_sample": false, | |
| "early_stopping": false, | |
| "embed_dim": 1280, | |
| "enable": true, | |
| "hidden_act": "quick_gelu", | |
| "hidden_size": 3584, | |
| "image_delimiter_token_id": 151688, | |
| "image_end_token_id": 151680, | |
| "image_line_token_id": 151682, | |
| "image_mean": [ | |
| 0.48145466, | |
| 0.4578275, | |
| 0.40821073 | |
| ], | |
| "image_pad_token_id": 151681, | |
| "image_size": 224, | |
| "image_start_token_id": 151679, | |
| "image_std": [ | |
| 0.26862954, | |
| 0.26130258, | |
| 0.27577711 | |
| ], | |
| "in_channels": 3, | |
| "in_chans": 3, | |
| "intermediate_size": 3072, | |
| "layer_norm_eps": 1e-05, | |
| "length_penalty": 1.0, | |
| "max_length": 20, | |
| "max_pixels": 3211264, | |
| "merge_size": 2, | |
| "min_length": 0, | |
| "min_pixels": 3136, | |
| "mlp_ratio": 4, | |
| "model_type": "clip_vision_model", | |
| "num_attention_heads": 12, | |
| "num_channels": 3, | |
| "num_heads": 16, | |
| "num_hidden_layers": 12, | |
| "patch_size": 14, | |
| "projection_dim": 512, | |
| "spatial_merge_size": 2, | |
| "spatial_patch_size": 14, | |
| "temporal_patch_size": 2 | |
| }, | |
| "vocab_size": 152064, | |
| "vocoder_config":{ | |
| "enable": true, | |
| "enable_multi_scale": true, | |
| "max_audio_seconds": 30, | |
| "sampling_rate": 16000, | |
| "hop_length": 256, | |
| "split_overlap": 0.0, | |
| "n_fft": 1024, | |
| "num_mel_bins": 80, | |
| "channels": [256, 256, 256, 256, 256] | |
| } | |
| } | |