Spaces:
Running
Running
| model_config: | |
| model_name: HunyuanVideo-Foley-XXL | |
| model_type: 1d | |
| model_precision: bf16 | |
| model_kwargs: | |
| depth_triple_blocks: 18 | |
| depth_single_blocks: 36 | |
| hidden_size: 1536 | |
| num_heads: 12 | |
| mlp_ratio: 4 | |
| mlp_act_type: "gelu_tanh" | |
| qkv_bias: True | |
| qk_norm: True | |
| qk_norm_type: "rms" | |
| attn_mode: "torch" | |
| embedder_type: "default" | |
| interleaved_audio_visual_rope: True | |
| enable_learnable_empty_visual_feat: True | |
| sync_modulation: False | |
| add_sync_feat_to_audio: True | |
| cross_attention: True | |
| use_attention_mask: False | |
| condition_projection: "linear" | |
| sync_feat_dim: 768 # Synchformer sync-feature dim (768) | |
| condition_dim: 768 # clap 768 text condition dim (clip-text) | |
| clip_dim: 768 # SigLIP2 visual feature dim | |
| audio_vae_latent_dim: 128 | |
| audio_frame_rate: 50 | |
| patch_size: 1 | |
| rope_dim_list: null | |
| rope_theta: 10000 | |
| text_length: 77 | |
| clip_length: 64 | |
| sync_length: 192 | |
| use_mmaudio_singleblock: True | |
| depth_triple_ssl_encoder: null | |
| depth_single_ssl_encoder: 8 | |
| use_repa_with_audiossl: True | |
| diffusion_config: | |
| denoise_type: "flow" | |
| flow_path_type: "linear" | |
| flow_predict_type: "velocity" | |
| flow_reverse: True | |
| flow_solver: "euler" | |
| sample_flow_shift: 1.0 | |
| sample_use_flux_shift: False | |
| flux_base_shift: 0.5 | |
| flux_max_shift: 1.15 | |