---
# Configuration for the HunyuanVideo-Foley-XXL model and its diffusion
# (flow) sampling settings.
#
# NOTE(review): this file was reconstructed from a version that had been
# collapsed onto a single line. The nesting below is inferred from key order
# and naming; confirm against the consumer that every key from
# `depth_triple_blocks` through `use_repa_with_audiossl` belongs under
# `model_kwargs`, and that `diffusion_config` is a top-level section.

model_config:
  model_name: HunyuanVideo-Foley-XXL
  # Quoted so it is always read as a string, never as a number-like token.
  model_type: "1d"
  model_precision: bf16
  model_kwargs:
    # Transformer layout.
    depth_triple_blocks: 18
    depth_single_blocks: 36
    hidden_size: 1536
    num_heads: 12
    mlp_ratio: 4
    mlp_act_type: "gelu_tanh"

    # Attention options.
    qkv_bias: true
    qk_norm: true
    qk_norm_type: "rms"
    attn_mode: "torch"
    embedder_type: "default"
    interleaved_audio_visual_rope: true
    enable_learnable_empty_visual_feat: true
    sync_modulation: false
    add_sync_feat_to_audio: true
    cross_attention: true
    use_attention_mask: false
    condition_projection: "linear"

    # Feature dimensions.
    sync_feat_dim: 768  # syncformer 768 dim
    condition_dim: 768  # clap 768 text condition dim (clip-text)
    clip_dim: 768  # siglip2 visual dim
    audio_vae_latent_dim: 128
    audio_frame_rate: 50
    patch_size: 1

    # Rotary position embedding settings.
    rope_dim_list: null
    rope_theta: 10000

    # Sequence lengths per input.
    text_length: 77
    clip_length: 64
    sync_length: 192

    use_mmaudio_singleblock: true
    depth_triple_ssl_encoder: null
    depth_single_ssl_encoder: 8
    use_repa_with_audiossl: true

diffusion_config:
  denoise_type: "flow"
  flow_path_type: "linear"
  flow_predict_type: "velocity"
  flow_reverse: true
  flow_solver: "euler"
  sample_flow_shift: 1.0
  sample_use_flux_shift: false
  flux_base_shift: 0.5
  flux_max_shift: 1.15