fffiloni committed
Commit 26557da · verified · 1 Parent(s): b2d49bb

Migrated from GitHub

This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
Files changed (50)
  1. .gitattributes +4 -0
  2. LICENSE +201 -0
  3. ORIGINAL_README.md +238 -0
  4. app.py +163 -0
  5. assets/Stand-In.png +0 -0
  6. configs/model_config.py +1809 -0
  7. data/video.py +158 -0
  8. distributed/__init__.py +0 -0
  9. distributed/xdit_context_parallel.py +154 -0
  10. download_models.py +21 -0
  11. infer.py +85 -0
  12. infer_face_swap.py +119 -0
  13. infer_with_lora.py +94 -0
  14. infer_with_vace.py +106 -0
  15. lora/__init__.py +91 -0
  16. models/__init__.py +1 -0
  17. models/attention.py +130 -0
  18. models/downloader.py +122 -0
  19. models/model_manager.py +610 -0
  20. models/set_condition_branch.py +41 -0
  21. models/tiler.py +333 -0
  22. models/utils.py +219 -0
  23. models/wan_video_camera_controller.py +290 -0
  24. models/wan_video_dit.py +952 -0
  25. models/wan_video_image_encoder.py +957 -0
  26. models/wan_video_motion_controller.py +41 -0
  27. models/wan_video_text_encoder.py +289 -0
  28. models/wan_video_vace.py +140 -0
  29. models/wan_video_vae.py +1634 -0
  30. pipelines/base.py +173 -0
  31. pipelines/wan_video.py +1793 -0
  32. pipelines/wan_video_face_swap.py +1786 -0
  33. preprocessor/__init__.py +2 -0
  34. preprocessor/image_input_preprocessor.py +181 -0
  35. preprocessor/videomask_generator.py +242 -0
  36. prompters/__init__.py +3 -0
  37. prompters/base_prompter.py +68 -0
  38. prompters/omost.py +472 -0
  39. prompters/prompt_refiners.py +131 -0
  40. prompters/wan_prompter.py +112 -0
  41. requirements.txt +17 -0
  42. schedulers/__init__.py +3 -0
  43. schedulers/continuous_ode.py +61 -0
  44. schedulers/ddim.py +136 -0
  45. schedulers/flow_match.py +100 -0
  46. test/input/first_frame.png +3 -0
  47. test/input/lecun.jpg +0 -0
  48. test/input/pose.mp4 +3 -0
  49. test/input/ruonan.jpg +3 -0
  50. test/input/woman.mp4 +3 -0
.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+ test/input/first_frame.png filter=lfs diff=lfs merge=lfs -text
+ test/input/pose.mp4 filter=lfs diff=lfs merge=lfs -text
+ test/input/ruonan.jpg filter=lfs diff=lfs merge=lfs -text
+ test/input/woman.mp4 filter=lfs diff=lfs merge=lfs -text
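For reference, the same four entries could be produced with `git lfs track`, which appends matching filter rules to `.gitattributes`. This is only an illustrative sketch and assumes Git LFS is installed and the repository is already initialized:

```bash
# Hypothetical equivalent of the hunk above: register the new test assets with Git LFS.
# `git lfs track` appends "filter=lfs diff=lfs merge=lfs -text" rules to .gitattributes.
git lfs install
git lfs track "test/input/first_frame.png" "test/input/pose.mp4" \
              "test/input/ruonan.jpg" "test/input/woman.mp4"
git add .gitattributes test/input/
```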
LICENSE ADDED
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
ORIGINAL_README.md ADDED
@@ -0,0 +1,238 @@
+ <div align="center">
+
+ <h1>
+ <img src="assets/Stand-In.png" width="85" alt="Logo" valign="middle">
+ Stand-In
+ </h1>
+
+ <h3>A Lightweight and Plug-and-Play Identity Control for Video Generation</h3>
+
+ [![arXiv](https://img.shields.io/badge/arXiv-2508.07901-b31b1b)](https://arxiv.org/abs/2508.07901)
+ [![Project Page](https://img.shields.io/badge/Project_Page-Link-green)](https://www.stand-in.tech)
+ [![🤗 HuggingFace](https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Model-orange)](https://huggingface.co/BowenXue/Stand-In)
+
+ </div>
+
+ <img width="5333" height="2983" alt="Image" src="https://github.com/user-attachments/assets/2fe1e505-bcf7-4eb6-8628-f23e70020966" />
+
+ > **Stand-In** is a lightweight, plug-and-play framework for identity-preserving video generation. By training only **1%** additional parameters compared to the base video generation model, we achieve state-of-the-art results in both Face Similarity and Naturalness, outperforming various full-parameter training methods. Moreover, **Stand-In** can be seamlessly integrated into other tasks such as subject-driven video generation, pose-controlled video generation, video stylization, and face swapping.
+
+ ---
+
+ ## 🔥 News
+ * **[2025.08.18]** We have released a version compatible with VACE. Beyond pose control, you can also try other control methods, such as depth maps, combined with Stand-In to preserve identity at the same time.
+
+ * **[2025.08.16]** We have released an experimental version of the face-swapping feature. Feel free to try it out!
+
+ * **[2025.08.13]** Special thanks to @kijai for integrating Stand-In into the custom ComfyUI node **WanVideoWrapper**. However, that implementation differs from the official version, which may affect Stand-In's performance.
+ To address part of the issue, we have released the official Stand-In preprocessing ComfyUI node:
+ 👉 https://github.com/WeChatCV/Stand-In_Preprocessor_ComfyUI
+ If you wish to use Stand-In within ComfyUI, please use **our official preprocessing node** in place of the one implemented by kijai.
+ For the best results, we recommend waiting for the release of our full **official Stand-In ComfyUI** integration.
+
+ * **[2025.08.12]** Released Stand-In v1.0 (153M parameters); the Wan2.1-14B-T2V-adapted weights and inference code are now open-sourced.
+
+ ---
+
+ ## 🌟 Showcase
+
+ ### Identity-Preserving Text-to-Video Generation
+
+ | Reference Image | Prompt | Generated Video |
+ | :---: | :---: | :---: |
+ |![Image](https://github.com/user-attachments/assets/86ce50d7-8ccb-45bf-9538-aea7f167a541)| "In a corridor where the walls ripple like water, a woman reaches out to touch the flowing surface, causing circles of ripples to spread. The camera moves from a medium shot to a close-up, capturing her curious expression as she sees her distorted reflection." |![Image](https://github.com/user-attachments/assets/c3c80bbf-a1cc-46a1-b47b-1b28bcad34a3) |
+ |![Image](https://github.com/user-attachments/assets/de10285e-7983-42bb-8534-80ac02210172)| "A young man dressed in traditional attire draws the long sword from his waist and begins to wield it. The blade flashes with light as he moves—his eyes sharp, his actions swift and powerful, with his flowing robes dancing in the wind." |![Image](https://github.com/user-attachments/assets/1532c701-ef01-47be-86da-d33c8c6894ab)|
+
+ ---
+ ### Non-Human Subject-Preserving Video Generation
+
+ | Reference Image | Prompt | Generated Video |
+ | :---: | :---: | :---: |
+ |<img width="415" height="415" alt="Image" src="https://github.com/user-attachments/assets/b929444d-d724-4cf9-b422-be82b380ff78" />|"A chibi-style boy speeding on a skateboard, holding a detective novel in one hand. The background features city streets, with trees, streetlights, and billboards along the roads."|![Image](https://github.com/user-attachments/assets/a7239232-77bc-478b-a0d9-ecc77db97aa5) |
+
+ ---
+
+ ### Identity-Preserving Stylized Video Generation
+
+ | Reference Image | LoRA | Generated Video |
+ | :---: | :---: | :---: |
+ |![Image](https://github.com/user-attachments/assets/9c0687f9-e465-4bc5-bc62-8ac46d5f38b1)|Ghibli LoRA|![Image](https://github.com/user-attachments/assets/c6ca1858-de39-4fff-825a-26e6d04e695f)|
+
+ ---
+
+ ### Video Face Swapping
+
+ | Reference Video | Identity | Generated Video |
+ | :---: | :---: | :---: |
+ |![Image](https://github.com/user-attachments/assets/33370ac7-364a-4f97-8ba9-14e1009cd701)|<img width="415" height="415" alt="Image" src="https://github.com/user-attachments/assets/d2cd8da0-7aa0-4ee4-a61d-b52718c33756" />|![Image](https://github.com/user-attachments/assets/0db8aedd-411f-414a-9227-88f4e4050b50)|
+
+ ---
+
+ ### Pose-Guided Video Generation (With VACE)
+
+ | Reference Pose | First Frame | Generated Video |
+ | :---: | :---: | :---: |
+ |![Image](https://github.com/user-attachments/assets/5df5eec8-b71c-4270-8a78-906a488f9a94)|<img width="719" height="415" alt="Image" src="https://github.com/user-attachments/assets/1c2a69e1-e530-4164-848b-e7ea85a99763" />|![Image](https://github.com/user-attachments/assets/1c8a54da-01d6-43c1-a5fd-cab0c9e32c44)|
+
+ ---
+ ### For more results, please visit [https://stand-in-video.github.io/](https://www.Stand-In.tech)
+
+ ## 📖 Key Features
+ - Efficient Training: Only 1% of the base model parameters need to be trained.
+ - High Fidelity: Outstanding identity consistency without sacrificing video generation quality.
+ - Plug-and-Play: Easily integrates into existing T2V (Text-to-Video) models.
+ - Highly Extensible: Compatible with community models such as LoRA, and supports various downstream video tasks.
+
+ ---
+
+ ## ✅ Todo List
+ - [x] Release IP2V inference script (compatible with community LoRA).
+ - [x] Open-source model weights compatible with Wan2.1-14B-T2V: `Stand-In_Wan2.1-T2V-14B_153M_v1.0`.
+ - [ ] Open-source model weights compatible with Wan2.2-T2V-A14B.
+ - [ ] Release training dataset, data preprocessing scripts, and training code.
+
+ ---
+
+ ## 🚀 Quick Start
+
+ ### 1. Environment Setup
+ ```bash
+ # Clone the project repository
+ git clone https://github.com/WeChatCV/Stand-In.git
+ cd Stand-In
+
+ # Create and activate Conda environment
+ conda create -n Stand-In python=3.11 -y
+ conda activate Stand-In
+
+ # Install dependencies
+ pip install -r requirements.txt
+
+ # (Optional) Install Flash Attention for faster inference
+ # Note: Make sure your GPU and CUDA version are compatible with Flash Attention
+ pip install flash-attn --no-build-isolation
+ ```
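As a quick sanity check after the optional Flash Attention step, something like the following can confirm that the packages import inside the `Stand-In` environment. This is an illustrative snippet, not part of the repository:

```bash
# Illustrative check: verify PyTorch sees a CUDA GPU and flash-attn (if installed) imports cleanly.
python -c "import torch; print(torch.__version__, torch.cuda.is_available())"
python -c "import flash_attn; print(flash_attn.__version__)"  # only relevant if flash-attn was installed
```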
+
+ ### 2. Model Download
+ We provide an automatic download script that fetches all required model weights into the `checkpoints` directory.
+ ```bash
+ python download_models.py
+ ```
+ This script downloads the following models:
+ * `wan2.1-T2V-14B` (base text-to-video model)
+ * `antelopev2` (face recognition model)
+ * `Stand-In` (our Stand-In model)
+
+ > Note: If you already have the `wan2.1-T2V-14B` model locally, you can edit `download_models.py` to comment out the corresponding download code and place the model in the `checkpoints/wan2.1-T2V-14B` directory yourself, as sketched below.
+
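For example, reusing an existing local copy might look like the following. This is only a sketch: the source path is hypothetical, and the destination directory name must match `checkpoints/wan2.1-T2V-14B` as described in the note above.

```bash
# Hypothetical example: point the expected checkpoint directory at an existing local copy
# of the base model instead of re-downloading it. Adjust /path/to/local/Wan2.1-T2V-14B.
mkdir -p checkpoints
ln -s /path/to/local/Wan2.1-T2V-14B checkpoints/wan2.1-T2V-14B
# Then comment out the corresponding download call in download_models.py before running it.
```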
+ ---
+
+ ## 🧪 Usage
+
+ ### Standard Inference
+
+ Use the `infer.py` script for standard identity-preserving text-to-video generation.
+
+ ```bash
+ python infer.py \
+     --prompt "A man sits comfortably at a desk, facing the camera as if talking to a friend or family member on the screen. His gaze is focused and gentle, with a natural smile. The background is his carefully decorated personal space, with photos and a world map on the wall, conveying a sense of intimate and modern communication." \
+     --ip_image "test/input/lecun.jpg" \
+     --output "test/output/lecun.mp4"
+ ```
+ **Prompt Writing Tip:** If you do not wish to alter the subject's facial features, simply use *"a man"* or *"a woman"* without adding extra descriptions of their appearance. Prompts support both Chinese and English input and are best suited to generating frontal, medium-to-close-up videos.
+
+ **Input Image Recommendation:** For best results, use a high-resolution frontal face image. There are no restrictions on resolution or file extension, as our built-in preprocessing pipeline handles them automatically.
+
+ ---
+
+ ### Inference with Community LoRA
+
+ Use the `infer_with_lora.py` script to load one or more community LoRA models alongside Stand-In.
+
+ ```bash
+ python infer_with_lora.py \
+     --prompt "A man sits comfortably at a desk, facing the camera as if talking to a friend or family member on the screen. His gaze is focused and gentle, with a natural smile. The background is his carefully decorated personal space, with photos and a world map on the wall, conveying a sense of intimate and modern communication." \
+     --ip_image "test/input/lecun.jpg" \
+     --output "test/output/lecun.mp4" \
+     --lora_path "path/to/your/lora.safetensors" \
+     --lora_scale 1.0
+ ```
+
+ We recommend using this stylization LoRA: [https://civitai.com/models/1404755/studio-ghibli-wan21-t2v-14b](https://civitai.com/models/1404755/studio-ghibli-wan21-t2v-14b)
+
+ ---
+
+ ### Video Face Swapping
+
+ Use the `infer_face_swap.py` script to perform video face swapping with Stand-In.
+
+ ```bash
+ python infer_face_swap.py \
+     --prompt "The video features a woman standing in front of a large screen displaying the words \"Tech Minute\" and the logo for CNET. She is wearing a purple top and appears to be presenting or speaking about technology-related topics. The background includes a cityscape with tall buildings, suggesting an urban setting. The woman seems to be engaged in a discussion or providing information on technology news or trends. The overall atmosphere is professional and informative, likely aimed at educating viewers about the latest developments in the tech industry." \
+     --ip_image "test/input/ruonan.jpg" \
+     --output "test/output/ruonan.mp4" \
+     --denoising_strength 0.85
+ ```
+ **Note**: Since Wan2.1 itself does not have an inpainting function, the face-swapping feature is still experimental.
+
+ The higher the `denoising_strength`, the more of the background is redrawn and the more natural the face region becomes. Conversely, the lower the `denoising_strength`, the less the background is redrawn and the more the face region tends to overfit the reference identity.
+
+ You can pass `--force_background_consistency` to keep the background fully consistent, but this may introduce noticeable contour artifacts around the face. Enabling it requires experimenting with different `denoising_strength` values to find the most natural result; if slight background changes are acceptable, leave it disabled. An example run is shown below.
+
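As an illustration of the trade-off described above, a run that locks the background might look like this. It is a hedged sketch: the prompt is shortened, `--force_background_consistency` is assumed to be a boolean flag, and the chosen `denoising_strength` is only a starting point to tune.

```bash
# Hypothetical tuning example: keep the background fixed and search for a natural
# denoising_strength. Lower values preserve more background but risk an overfitted face.
python infer_face_swap.py \
    --prompt "A woman presents technology news in front of a large screen." \
    --ip_image "test/input/ruonan.jpg" \
    --output "test/output/ruonan_consistent.mp4" \
    --denoising_strength 0.7 \
    --force_background_consistency
```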
+ ### Inference with VACE
+ Use the `infer_with_vace.py` script to perform identity-preserving video generation with Stand-In in combination with VACE.
+ ```bash
+ python infer_with_vace.py \
+     --prompt "A woman raises her hands." \
+     --vace_path "checkpoints/VACE/" \
+     --ip_image "test/input/first_frame.png" \
+     --reference_video "test/input/pose.mp4" \
+     --reference_image "test/input/first_frame.png" \
+     --output "test/output/woman.mp4" \
+     --vace_scale 0.8
+ ```
+ You need to download the corresponding weights from the `VACE` repository, or point the `vace_path` parameter at an existing copy of the `VACE` weights.
+
+ ```bash
+ python download_models.py --vace
+ ```
+
+ The input control video must be preprocessed with VACE's preprocessing tool. Both `reference_video` and `reference_image` are optional and can be provided together. Note that VACE's control has a preset bias towards faces, which affects identity preservation, so lower `vace_scale` to a balance point where both motion and identity are preserved. When only `ip_image` and `reference_video` are provided, the scale can be reduced to 0.5 (see the example below).
+
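For instance, a pose-only run without a reference image could use the lower scale mentioned above. This is a sketch assembled from the documented arguments; the identity image and output path are placeholders.

```bash
# Hypothetical variant: only ip_image and reference_video are given, so vace_scale is
# lowered to 0.5 to keep identity preservation strong while still following the pose.
python infer_with_vace.py \
    --prompt "A woman raises her hands." \
    --vace_path "checkpoints/VACE/" \
    --ip_image "test/input/lecun.jpg" \
    --reference_video "test/input/pose.mp4" \
    --output "test/output/pose_only.mp4" \
    --vace_scale 0.5
```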
+ Using Stand-In together with VACE is more challenging than using Stand-In alone. We are still maintaining this feature, so if you encounter unexpected outputs or have other questions, feel free to open a GitHub issue.
+
+
+ ## 🤝 Acknowledgements
+
+ This project is built upon the following excellent open-source projects:
+ * [DiffSynth-Studio](https://github.com/modelscope/DiffSynth-Studio) (training/inference framework)
+ * [Wan2.1](https://github.com/Wan-Video/Wan2.1) (base video generation model)
+
+ We sincerely thank the authors and contributors of these projects.
+
+ The original raw material of our dataset was collected with the help of our team member [Binxin Yang](https://binxinyang.github.io/), and we appreciate his contribution!
+
+ ---
+
+ ## ✏ Citation
+
+ If you find our work helpful for your research, please consider citing our paper:
+
+ ```bibtex
+ @article{xue2025standin,
+   title={Stand-In: A Lightweight and Plug-and-Play Identity Control for Video Generation},
+   author={Bowen Xue and Qixin Yan and Wenjing Wang and Hao Liu and Chen Li},
+   journal={arXiv preprint arXiv:2508.07901},
+   year={2025},
+ }
+ ```
+
+ ---
+
+ ## 📬 Contact Us
+
+ If you have any questions or suggestions, feel free to reach out via [GitHub Issues](https://github.com/WeChatCV/Stand-In/issues). We look forward to your feedback!
app.py ADDED
@@ -0,0 +1,163 @@
+ import gradio as gr
+ import torch
+ import time
+ from PIL import Image
+ import tempfile
+ import os
+
+ from data.video import save_video
+ from wan_loader import load_wan_pipe
+ from models.set_condition_branch import set_stand_in
+ from preprocessor import FaceProcessor
+
+ # Load the face preprocessor, the base Wan pipeline, and the Stand-In conditioning weights once at startup.
+ print("Loading model, please wait...")
+ try:
+     ANTELOPEV2_PATH = "checkpoints/antelopev2"
+     BASE_MODEL_PATH = "checkpoints/base_model/"
+     LORA_MODEL_PATH = "checkpoints/Stand-In/Stand-In_wan2.1_T2V_14B_ver1.0.ckpt"
+
+     if not os.path.exists(ANTELOPEV2_PATH):
+         raise FileNotFoundError(
+             f"AntelopeV2 checkpoint not found at: {ANTELOPEV2_PATH}"
+         )
+     if not os.path.exists(BASE_MODEL_PATH):
+         raise FileNotFoundError(f"Base model not found at: {BASE_MODEL_PATH}")
+     if not os.path.exists(LORA_MODEL_PATH):
+         raise FileNotFoundError(f"LoRA model not found at: {LORA_MODEL_PATH}")
+
+     face_processor = FaceProcessor(antelopv2_path=ANTELOPEV2_PATH)
+     pipe = load_wan_pipe(base_path=BASE_MODEL_PATH, torch_dtype=torch.bfloat16)
+     set_stand_in(pipe, model_path=LORA_MODEL_PATH)
+     print("Model loaded successfully!")
+ except Exception as e:
+     # If loading fails, show a minimal error page instead of the main UI and exit.
+     print(f"Model loading failed: {e}")
+     with gr.Blocks() as demo:
+         gr.Markdown("# Error: Model Loading Failed")
+         gr.Markdown(f"""
+         Please check the following:
+         1. Make sure the checkpoint files are placed in the correct directory.
+         2. Ensure all dependencies are properly installed.
+         3. Check the console output for detailed error information.
+
+         **Error details**: {e}
+         """)
+     demo.launch()
+     exit()
+
+
+ def generate_video(
+     pil_image: Image.Image,
+     prompt: str,
+     seed: int,
+     negative_prompt: str,
+     num_steps: int,
+     fps: int,
+     quality: int,
+ ):
+     if pil_image is None:
+         raise gr.Error("Please upload a face image first!")
+
+     # Crop/align the uploaded face into the conditioning image expected by the pipeline.
+     print("Processing face...")
+     ip_image = face_processor.process(pil_image)
+     print("Face processing completed.")
+
+     print("Generating video...")
+     start_time = time.time()
+     video = pipe(
+         prompt=prompt,
+         negative_prompt=negative_prompt,
+         seed=int(seed),
+         ip_image=ip_image,
+         num_inference_steps=int(num_steps),
+         tiled=False,
+     )
+     end_time = time.time()
+     print(f"Video generated in {end_time - start_time:.2f} seconds.")
+
+     # Write the frames to a temporary .mp4 file and return its path to Gradio.
+     with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as temp_file:
+         video_path = temp_file.name
+     save_video(video, video_path, fps=int(fps), quality=quality)
+     print(f"Video saved to: {video_path}")
+     return video_path
+
+
+ with gr.Blocks(theme=gr.themes.Soft(), css="footer {display: none !important}") as demo:
+     gr.Markdown(
+         """
+         # Stand-In IP2V
+         """
+     )
+
+     with gr.Row():
+         with gr.Column(scale=1):
+             gr.Markdown("### 1. Upload a Face Image")
+             input_image = gr.Image(
+                 label="Upload Image",
+                 type="pil",
+                 image_mode="RGB",
+                 height=300,
+             )
+
+             gr.Markdown("### 2. Enter Core Parameters")
+             # Default prompt (Chinese): a man sits at a desk facing the camera, matching the README's English example.
+             input_prompt = gr.Textbox(
+                 label="Prompt",
+                 lines=4,
+                 value="一位男性舒适地坐在书桌前,正对着镜头,仿佛在与屏幕前的亲友对话。他的眼神专注而温柔,嘴角带着自然的笑意。背景是他精心布置的个人空间,墙上贴着照片和一张世界地图,传达出一种亲密而现代的沟通感。",
+                 placeholder="Please enter a detailed description of the scene, character actions, expressions, etc...",
+             )
+
+             input_seed = gr.Slider(
+                 label="Seed",
+                 minimum=0,
+                 maximum=100000,
+                 step=1,
+                 value=0,
+                 info="The same seed and parameters will generate the same result.",
+             )
+
+             with gr.Accordion("Advanced Options", open=False):
+                 # Default negative prompt (Chinese): discourages overexposure, blur, deformed hands/faces, cluttered backgrounds, etc.
+                 input_negative_prompt = gr.Textbox(
+                     label="Negative Prompt",
+                     lines=3,
+                     value="色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走",
+                 )
+                 input_steps = gr.Slider(
+                     label="Inference Steps",
+                     minimum=10,
+                     maximum=50,
+                     step=1,
+                     value=20,
+                     info="More steps may improve details but will take longer to generate.",
+                 )
+                 output_fps = gr.Slider(
+                     label="Video FPS", minimum=10, maximum=30, step=1, value=25
+                 )
+                 output_quality = gr.Slider(
+                     label="Video Quality", minimum=1, maximum=10, step=1, value=9
+                 )
+
+             generate_btn = gr.Button("Generate Video", variant="primary")
+
+         with gr.Column(scale=1):
+             gr.Markdown("### 3. View Generated Result")
+             output_video = gr.Video(
+                 label="Generated Video",
+                 height=480,
+             )
+
+     generate_btn.click(
+         fn=generate_video,
+         inputs=[
+             input_image,
+             input_prompt,
+             input_seed,
+             input_negative_prompt,
+             input_steps,
+             output_fps,
+             output_quality,
+         ],
+         outputs=output_video,
+         api_name="generate_video",
+     )
+
+ if __name__ == "__main__":
+     demo.launch(share=True, server_name="0.0.0.0", server_port=8080)
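Running this file starts the Gradio demo directly; the details below are taken from the launch call and the click handler above, and the command assumes the `checkpoints/` directory has already been populated.

```bash
# Start the Gradio demo defined in app.py.
# It binds to 0.0.0.0:8080 and also prints a temporary public share link (share=True);
# the click handler is additionally exposed under the API name "generate_video".
python app.py
```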
assets/Stand-In.png ADDED
configs/model_config.py ADDED
@@ -0,0 +1,1809 @@
1
+ from typing_extensions import Literal, TypeAlias
2
+
3
+ from models.wan_video_dit import WanModel
4
+ from models.wan_video_text_encoder import WanTextEncoder
5
+ from models.wan_video_image_encoder import WanImageEncoder
6
+ from models.wan_video_vae import WanVideoVAE, WanVideoVAE38
7
+ from models.wan_video_motion_controller import WanMotionControllerModel
8
+ from models.wan_video_vace import VaceWanModel
9
+
10
+
11
+ model_loader_configs = [
12
+ (
13
+ None,
14
+ "9269f8db9040a9d860eaca435be61814",
15
+ ["wan_video_dit"],
16
+ [WanModel],
17
+ "civitai",
18
+ ),
19
+ (
20
+ None,
21
+ "aafcfd9672c3a2456dc46e1cb6e52c70",
22
+ ["wan_video_dit"],
23
+ [WanModel],
24
+ "civitai",
25
+ ),
26
+ (
27
+ None,
28
+ "6bfcfb3b342cb286ce886889d519a77e",
29
+ ["wan_video_dit"],
30
+ [WanModel],
31
+ "civitai",
32
+ ),
33
+ (
34
+ None,
35
+ "6d6ccde6845b95ad9114ab993d917893",
36
+ ["wan_video_dit"],
37
+ [WanModel],
38
+ "civitai",
39
+ ),
40
+ (
41
+ None,
42
+ "6bfcfb3b342cb286ce886889d519a77e",
43
+ ["wan_video_dit"],
44
+ [WanModel],
45
+ "civitai",
46
+ ),
47
+ (
48
+ None,
49
+ "349723183fc063b2bfc10bb2835cf677",
50
+ ["wan_video_dit"],
51
+ [WanModel],
52
+ "civitai",
53
+ ),
54
+ (
55
+ None,
56
+ "efa44cddf936c70abd0ea28b6cbe946c",
57
+ ["wan_video_dit"],
58
+ [WanModel],
59
+ "civitai",
60
+ ),
61
+ (
62
+ None,
63
+ "3ef3b1f8e1dab83d5b71fd7b617f859f",
64
+ ["wan_video_dit"],
65
+ [WanModel],
66
+ "civitai",
67
+ ),
68
+ (
69
+ None,
70
+ "70ddad9d3a133785da5ea371aae09504",
71
+ ["wan_video_dit"],
72
+ [WanModel],
73
+ "civitai",
74
+ ),
75
+ (
76
+ None,
77
+ "26bde73488a92e64cc20b0a7485b9e5b",
78
+ ["wan_video_dit"],
79
+ [WanModel],
80
+ "civitai",
81
+ ),
82
+ (
83
+ None,
84
+ "ac6a5aa74f4a0aab6f64eb9a72f19901",
85
+ ["wan_video_dit"],
86
+ [WanModel],
87
+ "civitai",
88
+ ),
89
+ (
90
+ None,
91
+ "b61c605c2adbd23124d152ed28e049ae",
92
+ ["wan_video_dit"],
93
+ [WanModel],
94
+ "civitai",
95
+ ),
96
+ (
97
+ None,
98
+ "1f5ab7703c6fc803fdded85ff040c316",
99
+ ["wan_video_dit"],
100
+ [WanModel],
101
+ "civitai",
102
+ ),
103
+ (
104
+ None,
105
+ "5b013604280dd715f8457c6ed6d6a626",
106
+ ["wan_video_dit"],
107
+ [WanModel],
108
+ "civitai",
109
+ ),
110
+ (
111
+ None,
112
+ "a61453409b67cd3246cf0c3bebad47ba",
113
+ ["wan_video_dit", "wan_video_vace"],
114
+ [WanModel, VaceWanModel],
115
+ "civitai",
116
+ ),
117
+ (
118
+ None,
119
+ "7a513e1f257a861512b1afd387a8ecd9",
120
+ ["wan_video_dit", "wan_video_vace"],
121
+ [WanModel, VaceWanModel],
122
+ "civitai",
123
+ ),
124
+ (
125
+ None,
126
+ "cb104773c6c2cb6df4f9529ad5c60d0b",
127
+ ["wan_video_dit"],
128
+ [WanModel],
129
+ "diffusers",
130
+ ),
131
+ (
132
+ None,
133
+ "9c8818c2cbea55eca56c7b447df170da",
134
+ ["wan_video_text_encoder"],
135
+ [WanTextEncoder],
136
+ "civitai",
137
+ ),
138
+ (
139
+ None,
140
+ "5941c53e207d62f20f9025686193c40b",
141
+ ["wan_video_image_encoder"],
142
+ [WanImageEncoder],
143
+ "civitai",
144
+ ),
145
+ (
146
+ None,
147
+ "1378ea763357eea97acdef78e65d6d96",
148
+ ["wan_video_vae"],
149
+ [WanVideoVAE],
150
+ "civitai",
151
+ ),
152
+ (
153
+ None,
154
+ "ccc42284ea13e1ad04693284c7a09be6",
155
+ ["wan_video_vae"],
156
+ [WanVideoVAE],
157
+ "civitai",
158
+ ),
159
+ (
160
+ None,
161
+ "e1de6c02cdac79f8b739f4d3698cd216",
162
+ ["wan_video_vae"],
163
+ [WanVideoVAE38],
164
+ "civitai",
165
+ ),
166
+ (
167
+ None,
168
+ "dbd5ec76bbf977983f972c151d545389",
169
+ ["wan_video_motion_controller"],
170
+ [WanMotionControllerModel],
171
+ "civitai",
172
+ ),
173
+ ]
174
+ huggingface_model_loader_configs = [
175
+ # These configs are provided for detecting model type automatically.
176
+ # The format is (architecture_in_huggingface_config, huggingface_lib, model_name, redirected_architecture)
177
+ (
178
+ "ChatGLMModel",
179
+ "diffsynth.models.kolors_text_encoder",
180
+ "kolors_text_encoder",
181
+ None,
182
+ ),
183
+ ("MarianMTModel", "transformers.models.marian.modeling_marian", "translator", None),
184
+ (
185
+ "BloomForCausalLM",
186
+ "transformers.models.bloom.modeling_bloom",
187
+ "beautiful_prompt",
188
+ None,
189
+ ),
190
+ (
191
+ "Qwen2ForCausalLM",
192
+ "transformers.models.qwen2.modeling_qwen2",
193
+ "qwen_prompt",
194
+ None,
195
+ ),
196
+ # ("LlamaForCausalLM", "transformers.models.llama.modeling_llama", "omost_prompt", None),
197
+ (
198
+ "T5EncoderModel",
199
+ "diffsynth.models.flux_text_encoder",
200
+ "flux_text_encoder_2",
201
+ "FluxTextEncoder2",
202
+ ),
203
+ ("CogVideoXTransformer3DModel", "diffsynth.models.cog_dit", "cog_dit", "CogDiT"),
204
+ (
205
+ "SiglipModel",
206
+ "transformers.models.siglip.modeling_siglip",
207
+ "siglip_vision_model",
208
+ "SiglipVisionModel",
209
+ ),
210
+ (
211
+ "LlamaForCausalLM",
212
+ "diffsynth.models.hunyuan_video_text_encoder",
213
+ "hunyuan_video_text_encoder_2",
214
+ "HunyuanVideoLLMEncoder",
215
+ ),
216
+ (
217
+ "LlavaForConditionalGeneration",
218
+ "diffsynth.models.hunyuan_video_text_encoder",
219
+ "hunyuan_video_text_encoder_2",
220
+ "HunyuanVideoMLLMEncoder",
221
+ ),
222
+ (
223
+ "Step1Model",
224
+ "diffsynth.models.stepvideo_text_encoder",
225
+ "stepvideo_text_encoder_2",
226
+ "STEP1TextEncoder",
227
+ ),
228
+ (
229
+ "Qwen2_5_VLForConditionalGeneration",
230
+ "diffsynth.models.qwenvl",
231
+ "qwenvl",
232
+ "Qwen25VL_7b_Embedder",
233
+ ),
234
+ ]
235
+ patch_model_loader_configs = [
236
+ # These configs are provided for detecting model type automatically.
237
+ # The format is (state_dict_keys_hash_with_shape, model_name, model_class, extra_kwargs)
238
+ ]
239
+
240
+ preset_models_on_huggingface = {
241
+ "HunyuanDiT": [
242
+ (
243
+ "Tencent-Hunyuan/HunyuanDiT",
244
+ "t2i/clip_text_encoder/pytorch_model.bin",
245
+ "models/HunyuanDiT/t2i/clip_text_encoder",
246
+ ),
247
+ (
248
+ "Tencent-Hunyuan/HunyuanDiT",
249
+ "t2i/mt5/pytorch_model.bin",
250
+ "models/HunyuanDiT/t2i/mt5",
251
+ ),
252
+ (
253
+ "Tencent-Hunyuan/HunyuanDiT",
254
+ "t2i/model/pytorch_model_ema.pt",
255
+ "models/HunyuanDiT/t2i/model",
256
+ ),
257
+ (
258
+ "Tencent-Hunyuan/HunyuanDiT",
259
+ "t2i/sdxl-vae-fp16-fix/diffusion_pytorch_model.bin",
260
+ "models/HunyuanDiT/t2i/sdxl-vae-fp16-fix",
261
+ ),
262
+ ],
263
+ "stable-video-diffusion-img2vid-xt": [
264
+ (
265
+ "stabilityai/stable-video-diffusion-img2vid-xt",
266
+ "svd_xt.safetensors",
267
+ "models/stable_video_diffusion",
268
+ ),
269
+ ],
270
+ "ExVideo-SVD-128f-v1": [
271
+ (
272
+ "ECNU-CILab/ExVideo-SVD-128f-v1",
273
+ "model.fp16.safetensors",
274
+ "models/stable_video_diffusion",
275
+ ),
276
+ ],
277
+ # Stable Diffusion
278
+ "StableDiffusion_v15": [
279
+ (
280
+ "benjamin-paine/stable-diffusion-v1-5",
281
+ "v1-5-pruned-emaonly.safetensors",
282
+ "models/stable_diffusion",
283
+ ),
284
+ ],
285
+ "DreamShaper_8": [
286
+ ("Yntec/Dreamshaper8", "dreamshaper_8.safetensors", "models/stable_diffusion"),
287
+ ],
288
+ # Textual Inversion
289
+ "TextualInversion_VeryBadImageNegative_v1.3": [
290
+ (
291
+ "gemasai/verybadimagenegative_v1.3",
292
+ "verybadimagenegative_v1.3.pt",
293
+ "models/textual_inversion",
294
+ ),
295
+ ],
296
+ # Stable Diffusion XL
297
+ "StableDiffusionXL_v1": [
298
+ (
299
+ "stabilityai/stable-diffusion-xl-base-1.0",
300
+ "sd_xl_base_1.0.safetensors",
301
+ "models/stable_diffusion_xl",
302
+ ),
303
+ ],
304
+ "BluePencilXL_v200": [
305
+ (
306
+ "frankjoshua/bluePencilXL_v200",
307
+ "bluePencilXL_v200.safetensors",
308
+ "models/stable_diffusion_xl",
309
+ ),
310
+ ],
311
+ "StableDiffusionXL_Turbo": [
312
+ (
313
+ "stabilityai/sdxl-turbo",
314
+ "sd_xl_turbo_1.0_fp16.safetensors",
315
+ "models/stable_diffusion_xl_turbo",
316
+ ),
317
+ ],
318
+ # Stable Diffusion 3
319
+ "StableDiffusion3": [
320
+ (
321
+ "stabilityai/stable-diffusion-3-medium",
322
+ "sd3_medium_incl_clips_t5xxlfp16.safetensors",
323
+ "models/stable_diffusion_3",
324
+ ),
325
+ ],
326
+ "StableDiffusion3_without_T5": [
327
+ (
328
+ "stabilityai/stable-diffusion-3-medium",
329
+ "sd3_medium_incl_clips.safetensors",
330
+ "models/stable_diffusion_3",
331
+ ),
332
+ ],
333
+ # ControlNet
334
+ "ControlNet_v11f1p_sd15_depth": [
335
+ (
336
+ "lllyasviel/ControlNet-v1-1",
337
+ "control_v11f1p_sd15_depth.pth",
338
+ "models/ControlNet",
339
+ ),
340
+ ("lllyasviel/Annotators", "dpt_hybrid-midas-501f0c75.pt", "models/Annotators"),
341
+ ],
342
+ "ControlNet_v11p_sd15_softedge": [
343
+ (
344
+ "lllyasviel/ControlNet-v1-1",
345
+ "control_v11p_sd15_softedge.pth",
346
+ "models/ControlNet",
347
+ ),
348
+ ("lllyasviel/Annotators", "ControlNetHED.pth", "models/Annotators"),
349
+ ],
350
+ "ControlNet_v11f1e_sd15_tile": [
351
+ (
352
+ "lllyasviel/ControlNet-v1-1",
353
+ "control_v11f1e_sd15_tile.pth",
354
+ "models/ControlNet",
355
+ )
356
+ ],
357
+ "ControlNet_v11p_sd15_lineart": [
358
+ (
359
+ "lllyasviel/ControlNet-v1-1",
360
+ "control_v11p_sd15_lineart.pth",
361
+ "models/ControlNet",
362
+ ),
363
+ ("lllyasviel/Annotators", "sk_model.pth", "models/Annotators"),
364
+ ("lllyasviel/Annotators", "sk_model2.pth", "models/Annotators"),
365
+ ],
366
+ "ControlNet_union_sdxl_promax": [
367
+ (
368
+ "xinsir/controlnet-union-sdxl-1.0",
369
+ "diffusion_pytorch_model_promax.safetensors",
370
+ "models/ControlNet/controlnet_union",
371
+ ),
372
+ ("lllyasviel/Annotators", "dpt_hybrid-midas-501f0c75.pt", "models/Annotators"),
373
+ ],
374
+ # AnimateDiff
375
+ "AnimateDiff_v2": [
376
+ ("guoyww/animatediff", "mm_sd_v15_v2.ckpt", "models/AnimateDiff"),
377
+ ],
378
+ "AnimateDiff_xl_beta": [
379
+ ("guoyww/animatediff", "mm_sdxl_v10_beta.ckpt", "models/AnimateDiff"),
380
+ ],
381
+ # Qwen Prompt
382
+ "QwenPrompt": [
383
+ (
384
+ "Qwen/Qwen2-1.5B-Instruct",
385
+ "config.json",
386
+ "models/QwenPrompt/qwen2-1.5b-instruct",
387
+ ),
388
+ (
389
+ "Qwen/Qwen2-1.5B-Instruct",
390
+ "generation_config.json",
391
+ "models/QwenPrompt/qwen2-1.5b-instruct",
392
+ ),
393
+ (
394
+ "Qwen/Qwen2-1.5B-Instruct",
395
+ "model.safetensors",
396
+ "models/QwenPrompt/qwen2-1.5b-instruct",
397
+ ),
398
+ (
399
+ "Qwen/Qwen2-1.5B-Instruct",
400
+ "special_tokens_map.json",
401
+ "models/QwenPrompt/qwen2-1.5b-instruct",
402
+ ),
403
+ (
404
+ "Qwen/Qwen2-1.5B-Instruct",
405
+ "tokenizer.json",
406
+ "models/QwenPrompt/qwen2-1.5b-instruct",
407
+ ),
408
+ (
409
+ "Qwen/Qwen2-1.5B-Instruct",
410
+ "tokenizer_config.json",
411
+ "models/QwenPrompt/qwen2-1.5b-instruct",
412
+ ),
413
+ (
414
+ "Qwen/Qwen2-1.5B-Instruct",
415
+ "merges.txt",
416
+ "models/QwenPrompt/qwen2-1.5b-instruct",
417
+ ),
418
+ (
419
+ "Qwen/Qwen2-1.5B-Instruct",
420
+ "vocab.json",
421
+ "models/QwenPrompt/qwen2-1.5b-instruct",
422
+ ),
423
+ ],
424
+ # Beautiful Prompt
425
+ "BeautifulPrompt": [
426
+ (
427
+ "alibaba-pai/pai-bloom-1b1-text2prompt-sd",
428
+ "config.json",
429
+ "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd",
430
+ ),
431
+ (
432
+ "alibaba-pai/pai-bloom-1b1-text2prompt-sd",
433
+ "generation_config.json",
434
+ "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd",
435
+ ),
436
+ (
437
+ "alibaba-pai/pai-bloom-1b1-text2prompt-sd",
438
+ "model.safetensors",
439
+ "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd",
440
+ ),
441
+ (
442
+ "alibaba-pai/pai-bloom-1b1-text2prompt-sd",
443
+ "special_tokens_map.json",
444
+ "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd",
445
+ ),
446
+ (
447
+ "alibaba-pai/pai-bloom-1b1-text2prompt-sd",
448
+ "tokenizer.json",
449
+ "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd",
450
+ ),
451
+ (
452
+ "alibaba-pai/pai-bloom-1b1-text2prompt-sd",
453
+ "tokenizer_config.json",
454
+ "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd",
455
+ ),
456
+ ],
457
+ # Omost prompt
458
+ "OmostPrompt": [
459
+ (
460
+ "lllyasviel/omost-llama-3-8b-4bits",
461
+ "model-00001-of-00002.safetensors",
462
+ "models/OmostPrompt/omost-llama-3-8b-4bits",
463
+ ),
464
+ (
465
+ "lllyasviel/omost-llama-3-8b-4bits",
466
+ "model-00002-of-00002.safetensors",
467
+ "models/OmostPrompt/omost-llama-3-8b-4bits",
468
+ ),
469
+ (
470
+ "lllyasviel/omost-llama-3-8b-4bits",
471
+ "tokenizer.json",
472
+ "models/OmostPrompt/omost-llama-3-8b-4bits",
473
+ ),
474
+ (
475
+ "lllyasviel/omost-llama-3-8b-4bits",
476
+ "tokenizer_config.json",
477
+ "models/OmostPrompt/omost-llama-3-8b-4bits",
478
+ ),
479
+ (
480
+ "lllyasviel/omost-llama-3-8b-4bits",
481
+ "config.json",
482
+ "models/OmostPrompt/omost-llama-3-8b-4bits",
483
+ ),
484
+ (
485
+ "lllyasviel/omost-llama-3-8b-4bits",
486
+ "generation_config.json",
487
+ "models/OmostPrompt/omost-llama-3-8b-4bits",
488
+ ),
489
+ (
490
+ "lllyasviel/omost-llama-3-8b-4bits",
491
+ "model.safetensors.index.json",
492
+ "models/OmostPrompt/omost-llama-3-8b-4bits",
493
+ ),
494
+ (
495
+ "lllyasviel/omost-llama-3-8b-4bits",
496
+ "special_tokens_map.json",
497
+ "models/OmostPrompt/omost-llama-3-8b-4bits",
498
+ ),
499
+ ],
500
+ # Translator
501
+ "opus-mt-zh-en": [
502
+ (
503
+ "Helsinki-NLP/opus-mt-zh-en",
504
+ "config.json",
505
+ "models/translator/opus-mt-zh-en",
506
+ ),
507
+ (
508
+ "Helsinki-NLP/opus-mt-zh-en",
509
+ "generation_config.json",
510
+ "models/translator/opus-mt-zh-en",
511
+ ),
512
+ (
513
+ "Helsinki-NLP/opus-mt-zh-en",
514
+ "metadata.json",
515
+ "models/translator/opus-mt-zh-en",
516
+ ),
517
+ (
518
+ "Helsinki-NLP/opus-mt-zh-en",
519
+ "pytorch_model.bin",
520
+ "models/translator/opus-mt-zh-en",
521
+ ),
522
+ ("Helsinki-NLP/opus-mt-zh-en", "source.spm", "models/translator/opus-mt-zh-en"),
523
+ ("Helsinki-NLP/opus-mt-zh-en", "target.spm", "models/translator/opus-mt-zh-en"),
524
+ (
525
+ "Helsinki-NLP/opus-mt-zh-en",
526
+ "tokenizer_config.json",
527
+ "models/translator/opus-mt-zh-en",
528
+ ),
529
+ ("Helsinki-NLP/opus-mt-zh-en", "vocab.json", "models/translator/opus-mt-zh-en"),
530
+ ],
531
+ # IP-Adapter
532
+ "IP-Adapter-SD": [
533
+ (
534
+ "h94/IP-Adapter",
535
+ "models/image_encoder/model.safetensors",
536
+ "models/IpAdapter/stable_diffusion/image_encoder",
537
+ ),
538
+ (
539
+ "h94/IP-Adapter",
540
+ "models/ip-adapter_sd15.bin",
541
+ "models/IpAdapter/stable_diffusion",
542
+ ),
543
+ ],
544
+ "IP-Adapter-SDXL": [
545
+ (
546
+ "h94/IP-Adapter",
547
+ "sdxl_models/image_encoder/model.safetensors",
548
+ "models/IpAdapter/stable_diffusion_xl/image_encoder",
549
+ ),
550
+ (
551
+ "h94/IP-Adapter",
552
+ "sdxl_models/ip-adapter_sdxl.bin",
553
+ "models/IpAdapter/stable_diffusion_xl",
554
+ ),
555
+ ],
556
+ "SDXL-vae-fp16-fix": [
557
+ (
558
+ "madebyollin/sdxl-vae-fp16-fix",
559
+ "diffusion_pytorch_model.safetensors",
560
+ "models/sdxl-vae-fp16-fix",
561
+ )
562
+ ],
563
+ # Kolors
564
+ "Kolors": [
565
+ (
566
+ "Kwai-Kolors/Kolors",
567
+ "text_encoder/config.json",
568
+ "models/kolors/Kolors/text_encoder",
569
+ ),
570
+ (
571
+ "Kwai-Kolors/Kolors",
572
+ "text_encoder/pytorch_model.bin.index.json",
573
+ "models/kolors/Kolors/text_encoder",
574
+ ),
575
+ (
576
+ "Kwai-Kolors/Kolors",
577
+ "text_encoder/pytorch_model-00001-of-00007.bin",
578
+ "models/kolors/Kolors/text_encoder",
579
+ ),
580
+ (
581
+ "Kwai-Kolors/Kolors",
582
+ "text_encoder/pytorch_model-00002-of-00007.bin",
583
+ "models/kolors/Kolors/text_encoder",
584
+ ),
585
+ (
586
+ "Kwai-Kolors/Kolors",
587
+ "text_encoder/pytorch_model-00003-of-00007.bin",
588
+ "models/kolors/Kolors/text_encoder",
589
+ ),
590
+ (
591
+ "Kwai-Kolors/Kolors",
592
+ "text_encoder/pytorch_model-00004-of-00007.bin",
593
+ "models/kolors/Kolors/text_encoder",
594
+ ),
595
+ (
596
+ "Kwai-Kolors/Kolors",
597
+ "text_encoder/pytorch_model-00005-of-00007.bin",
598
+ "models/kolors/Kolors/text_encoder",
599
+ ),
600
+ (
601
+ "Kwai-Kolors/Kolors",
602
+ "text_encoder/pytorch_model-00006-of-00007.bin",
603
+ "models/kolors/Kolors/text_encoder",
604
+ ),
605
+ (
606
+ "Kwai-Kolors/Kolors",
607
+ "text_encoder/pytorch_model-00007-of-00007.bin",
608
+ "models/kolors/Kolors/text_encoder",
609
+ ),
610
+ (
611
+ "Kwai-Kolors/Kolors",
612
+ "unet/diffusion_pytorch_model.safetensors",
613
+ "models/kolors/Kolors/unet",
614
+ ),
615
+ (
616
+ "Kwai-Kolors/Kolors",
617
+ "vae/diffusion_pytorch_model.safetensors",
618
+ "models/kolors/Kolors/vae",
619
+ ),
620
+ ],
621
+ # FLUX
622
+ "FLUX.1-dev": [
623
+ (
624
+ "black-forest-labs/FLUX.1-dev",
625
+ "text_encoder/model.safetensors",
626
+ "models/FLUX/FLUX.1-dev/text_encoder",
627
+ ),
628
+ (
629
+ "black-forest-labs/FLUX.1-dev",
630
+ "text_encoder_2/config.json",
631
+ "models/FLUX/FLUX.1-dev/text_encoder_2",
632
+ ),
633
+ (
634
+ "black-forest-labs/FLUX.1-dev",
635
+ "text_encoder_2/model-00001-of-00002.safetensors",
636
+ "models/FLUX/FLUX.1-dev/text_encoder_2",
637
+ ),
638
+ (
639
+ "black-forest-labs/FLUX.1-dev",
640
+ "text_encoder_2/model-00002-of-00002.safetensors",
641
+ "models/FLUX/FLUX.1-dev/text_encoder_2",
642
+ ),
643
+ (
644
+ "black-forest-labs/FLUX.1-dev",
645
+ "text_encoder_2/model.safetensors.index.json",
646
+ "models/FLUX/FLUX.1-dev/text_encoder_2",
647
+ ),
648
+ ("black-forest-labs/FLUX.1-dev", "ae.safetensors", "models/FLUX/FLUX.1-dev"),
649
+ (
650
+ "black-forest-labs/FLUX.1-dev",
651
+ "flux1-dev.safetensors",
652
+ "models/FLUX/FLUX.1-dev",
653
+ ),
654
+ ],
655
+ "InstantX/FLUX.1-dev-IP-Adapter": {
656
+ "file_list": [
657
+ (
658
+ "InstantX/FLUX.1-dev-IP-Adapter",
659
+ "ip-adapter.bin",
660
+ "models/IpAdapter/InstantX/FLUX.1-dev-IP-Adapter",
661
+ ),
662
+ (
663
+ "google/siglip-so400m-patch14-384",
664
+ "model.safetensors",
665
+ "models/IpAdapter/InstantX/FLUX.1-dev-IP-Adapter/image_encoder",
666
+ ),
667
+ (
668
+ "google/siglip-so400m-patch14-384",
669
+ "config.json",
670
+ "models/IpAdapter/InstantX/FLUX.1-dev-IP-Adapter/image_encoder",
671
+ ),
672
+ ],
673
+ "load_path": [
674
+ "models/IpAdapter/InstantX/FLUX.1-dev-IP-Adapter/ip-adapter.bin",
675
+ "models/IpAdapter/InstantX/FLUX.1-dev-IP-Adapter/image_encoder",
676
+ ],
677
+ },
678
+ # RIFE
679
+ "RIFE": [
680
+ ("AlexWortega/RIFE", "flownet.pkl", "models/RIFE"),
681
+ ],
682
+ # CogVideo
683
+ "CogVideoX-5B": [
684
+ (
685
+ "THUDM/CogVideoX-5b",
686
+ "text_encoder/config.json",
687
+ "models/CogVideo/CogVideoX-5b/text_encoder",
688
+ ),
689
+ (
690
+ "THUDM/CogVideoX-5b",
691
+ "text_encoder/model.safetensors.index.json",
692
+ "models/CogVideo/CogVideoX-5b/text_encoder",
693
+ ),
694
+ (
695
+ "THUDM/CogVideoX-5b",
696
+ "text_encoder/model-00001-of-00002.safetensors",
697
+ "models/CogVideo/CogVideoX-5b/text_encoder",
698
+ ),
699
+ (
700
+ "THUDM/CogVideoX-5b",
701
+ "text_encoder/model-00002-of-00002.safetensors",
702
+ "models/CogVideo/CogVideoX-5b/text_encoder",
703
+ ),
704
+ (
705
+ "THUDM/CogVideoX-5b",
706
+ "transformer/config.json",
707
+ "models/CogVideo/CogVideoX-5b/transformer",
708
+ ),
709
+ (
710
+ "THUDM/CogVideoX-5b",
711
+ "transformer/diffusion_pytorch_model.safetensors.index.json",
712
+ "models/CogVideo/CogVideoX-5b/transformer",
713
+ ),
714
+ (
715
+ "THUDM/CogVideoX-5b",
716
+ "transformer/diffusion_pytorch_model-00001-of-00002.safetensors",
717
+ "models/CogVideo/CogVideoX-5b/transformer",
718
+ ),
719
+ (
720
+ "THUDM/CogVideoX-5b",
721
+ "transformer/diffusion_pytorch_model-00002-of-00002.safetensors",
722
+ "models/CogVideo/CogVideoX-5b/transformer",
723
+ ),
724
+ (
725
+ "THUDM/CogVideoX-5b",
726
+ "vae/diffusion_pytorch_model.safetensors",
727
+ "models/CogVideo/CogVideoX-5b/vae",
728
+ ),
729
+ ],
730
+ # Stable Diffusion 3.5
731
+ "StableDiffusion3.5-large": [
732
+ (
733
+ "stabilityai/stable-diffusion-3.5-large",
734
+ "sd3.5_large.safetensors",
735
+ "models/stable_diffusion_3",
736
+ ),
737
+ (
738
+ "stabilityai/stable-diffusion-3.5-large",
739
+ "text_encoders/clip_l.safetensors",
740
+ "models/stable_diffusion_3/text_encoders",
741
+ ),
742
+ (
743
+ "stabilityai/stable-diffusion-3.5-large",
744
+ "text_encoders/clip_g.safetensors",
745
+ "models/stable_diffusion_3/text_encoders",
746
+ ),
747
+ (
748
+ "stabilityai/stable-diffusion-3.5-large",
749
+ "text_encoders/t5xxl_fp16.safetensors",
750
+ "models/stable_diffusion_3/text_encoders",
751
+ ),
752
+ ],
753
+ }
754
+ preset_models_on_modelscope = {
755
+ # Hunyuan DiT
756
+ "HunyuanDiT": [
757
+ (
758
+ "modelscope/HunyuanDiT",
759
+ "t2i/clip_text_encoder/pytorch_model.bin",
760
+ "models/HunyuanDiT/t2i/clip_text_encoder",
761
+ ),
762
+ (
763
+ "modelscope/HunyuanDiT",
764
+ "t2i/mt5/pytorch_model.bin",
765
+ "models/HunyuanDiT/t2i/mt5",
766
+ ),
767
+ (
768
+ "modelscope/HunyuanDiT",
769
+ "t2i/model/pytorch_model_ema.pt",
770
+ "models/HunyuanDiT/t2i/model",
771
+ ),
772
+ (
773
+ "modelscope/HunyuanDiT",
774
+ "t2i/sdxl-vae-fp16-fix/diffusion_pytorch_model.bin",
775
+ "models/HunyuanDiT/t2i/sdxl-vae-fp16-fix",
776
+ ),
777
+ ],
778
+ # Stable Video Diffusion
779
+ "stable-video-diffusion-img2vid-xt": [
780
+ (
781
+ "AI-ModelScope/stable-video-diffusion-img2vid-xt",
782
+ "svd_xt.safetensors",
783
+ "models/stable_video_diffusion",
784
+ ),
785
+ ],
786
+ # ExVideo
787
+ "ExVideo-SVD-128f-v1": [
788
+ (
789
+ "ECNU-CILab/ExVideo-SVD-128f-v1",
790
+ "model.fp16.safetensors",
791
+ "models/stable_video_diffusion",
792
+ ),
793
+ ],
794
+ "ExVideo-CogVideoX-LoRA-129f-v1": [
795
+ (
796
+ "ECNU-CILab/ExVideo-CogVideoX-LoRA-129f-v1",
797
+ "ExVideo-CogVideoX-LoRA-129f-v1.safetensors",
798
+ "models/lora",
799
+ ),
800
+ ],
801
+ # Stable Diffusion
802
+ "StableDiffusion_v15": [
803
+ (
804
+ "AI-ModelScope/stable-diffusion-v1-5",
805
+ "v1-5-pruned-emaonly.safetensors",
806
+ "models/stable_diffusion",
807
+ ),
808
+ ],
809
+ "DreamShaper_8": [
810
+ (
811
+ "sd_lora/dreamshaper_8",
812
+ "dreamshaper_8.safetensors",
813
+ "models/stable_diffusion",
814
+ ),
815
+ ],
816
+ "AingDiffusion_v12": [
817
+ (
818
+ "sd_lora/aingdiffusion_v12",
819
+ "aingdiffusion_v12.safetensors",
820
+ "models/stable_diffusion",
821
+ ),
822
+ ],
823
+ "Flat2DAnimerge_v45Sharp": [
824
+ (
825
+ "sd_lora/Flat-2D-Animerge",
826
+ "flat2DAnimerge_v45Sharp.safetensors",
827
+ "models/stable_diffusion",
828
+ ),
829
+ ],
830
+ # Textual Inversion
831
+ "TextualInversion_VeryBadImageNegative_v1.3": [
832
+ (
833
+ "sd_lora/verybadimagenegative_v1.3",
834
+ "verybadimagenegative_v1.3.pt",
835
+ "models/textual_inversion",
836
+ ),
837
+ ],
838
+ # Stable Diffusion XL
839
+ "StableDiffusionXL_v1": [
840
+ (
841
+ "AI-ModelScope/stable-diffusion-xl-base-1.0",
842
+ "sd_xl_base_1.0.safetensors",
843
+ "models/stable_diffusion_xl",
844
+ ),
845
+ ],
846
+ "BluePencilXL_v200": [
847
+ (
848
+ "sd_lora/bluePencilXL_v200",
849
+ "bluePencilXL_v200.safetensors",
850
+ "models/stable_diffusion_xl",
851
+ ),
852
+ ],
853
+ "StableDiffusionXL_Turbo": [
854
+ (
855
+ "AI-ModelScope/sdxl-turbo",
856
+ "sd_xl_turbo_1.0_fp16.safetensors",
857
+ "models/stable_diffusion_xl_turbo",
858
+ ),
859
+ ],
860
+ "SDXL_lora_zyd232_ChineseInkStyle_SDXL_v1_0": [
861
+ (
862
+ "sd_lora/zyd232_ChineseInkStyle_SDXL_v1_0",
863
+ "zyd232_ChineseInkStyle_SDXL_v1_0.safetensors",
864
+ "models/lora",
865
+ ),
866
+ ],
867
+ # Stable Diffusion 3
868
+ "StableDiffusion3": [
869
+ (
870
+ "AI-ModelScope/stable-diffusion-3-medium",
871
+ "sd3_medium_incl_clips_t5xxlfp16.safetensors",
872
+ "models/stable_diffusion_3",
873
+ ),
874
+ ],
875
+ "StableDiffusion3_without_T5": [
876
+ (
877
+ "AI-ModelScope/stable-diffusion-3-medium",
878
+ "sd3_medium_incl_clips.safetensors",
879
+ "models/stable_diffusion_3",
880
+ ),
881
+ ],
882
+ # ControlNet
883
+ "ControlNet_v11f1p_sd15_depth": [
884
+ (
885
+ "AI-ModelScope/ControlNet-v1-1",
886
+ "control_v11f1p_sd15_depth.pth",
887
+ "models/ControlNet",
888
+ ),
889
+ ("sd_lora/Annotators", "dpt_hybrid-midas-501f0c75.pt", "models/Annotators"),
890
+ ],
891
+ "ControlNet_v11p_sd15_softedge": [
892
+ (
893
+ "AI-ModelScope/ControlNet-v1-1",
894
+ "control_v11p_sd15_softedge.pth",
895
+ "models/ControlNet",
896
+ ),
897
+ ("sd_lora/Annotators", "ControlNetHED.pth", "models/Annotators"),
898
+ ],
899
+ "ControlNet_v11f1e_sd15_tile": [
900
+ (
901
+ "AI-ModelScope/ControlNet-v1-1",
902
+ "control_v11f1e_sd15_tile.pth",
903
+ "models/ControlNet",
904
+ )
905
+ ],
906
+ "ControlNet_v11p_sd15_lineart": [
907
+ (
908
+ "AI-ModelScope/ControlNet-v1-1",
909
+ "control_v11p_sd15_lineart.pth",
910
+ "models/ControlNet",
911
+ ),
912
+ ("sd_lora/Annotators", "sk_model.pth", "models/Annotators"),
913
+ ("sd_lora/Annotators", "sk_model2.pth", "models/Annotators"),
914
+ ],
915
+ "ControlNet_union_sdxl_promax": [
916
+ (
917
+ "AI-ModelScope/controlnet-union-sdxl-1.0",
918
+ "diffusion_pytorch_model_promax.safetensors",
919
+ "models/ControlNet/controlnet_union",
920
+ ),
921
+ ("sd_lora/Annotators", "dpt_hybrid-midas-501f0c75.pt", "models/Annotators"),
922
+ ],
923
+ "Annotators:Depth": [
924
+ ("sd_lora/Annotators", "dpt_hybrid-midas-501f0c75.pt", "models/Annotators"),
925
+ ],
926
+ "Annotators:Softedge": [
927
+ ("sd_lora/Annotators", "ControlNetHED.pth", "models/Annotators"),
928
+ ],
929
+ "Annotators:Lineart": [
930
+ ("sd_lora/Annotators", "sk_model.pth", "models/Annotators"),
931
+ ("sd_lora/Annotators", "sk_model2.pth", "models/Annotators"),
932
+ ],
933
+ "Annotators:Normal": [
934
+ ("sd_lora/Annotators", "scannet.pt", "models/Annotators"),
935
+ ],
936
+ "Annotators:Openpose": [
937
+ ("sd_lora/Annotators", "body_pose_model.pth", "models/Annotators"),
938
+ ("sd_lora/Annotators", "facenet.pth", "models/Annotators"),
939
+ ("sd_lora/Annotators", "hand_pose_model.pth", "models/Annotators"),
940
+ ],
941
+ # AnimateDiff
942
+ "AnimateDiff_v2": [
943
+ (
944
+ "Shanghai_AI_Laboratory/animatediff",
945
+ "mm_sd_v15_v2.ckpt",
946
+ "models/AnimateDiff",
947
+ ),
948
+ ],
949
+ "AnimateDiff_xl_beta": [
950
+ (
951
+ "Shanghai_AI_Laboratory/animatediff",
952
+ "mm_sdxl_v10_beta.ckpt",
953
+ "models/AnimateDiff",
954
+ ),
955
+ ],
956
+ # RIFE
957
+ "RIFE": [
958
+ ("Damo_XR_Lab/cv_rife_video-frame-interpolation", "flownet.pkl", "models/RIFE"),
959
+ ],
960
+ # Qwen Prompt
961
+ "QwenPrompt": {
962
+ "file_list": [
963
+ (
964
+ "qwen/Qwen2-1.5B-Instruct",
965
+ "config.json",
966
+ "models/QwenPrompt/qwen2-1.5b-instruct",
967
+ ),
968
+ (
969
+ "qwen/Qwen2-1.5B-Instruct",
970
+ "generation_config.json",
971
+ "models/QwenPrompt/qwen2-1.5b-instruct",
972
+ ),
973
+ (
974
+ "qwen/Qwen2-1.5B-Instruct",
975
+ "model.safetensors",
976
+ "models/QwenPrompt/qwen2-1.5b-instruct",
977
+ ),
978
+ (
979
+ "qwen/Qwen2-1.5B-Instruct",
980
+ "special_tokens_map.json",
981
+ "models/QwenPrompt/qwen2-1.5b-instruct",
982
+ ),
983
+ (
984
+ "qwen/Qwen2-1.5B-Instruct",
985
+ "tokenizer.json",
986
+ "models/QwenPrompt/qwen2-1.5b-instruct",
987
+ ),
988
+ (
989
+ "qwen/Qwen2-1.5B-Instruct",
990
+ "tokenizer_config.json",
991
+ "models/QwenPrompt/qwen2-1.5b-instruct",
992
+ ),
993
+ (
994
+ "qwen/Qwen2-1.5B-Instruct",
995
+ "merges.txt",
996
+ "models/QwenPrompt/qwen2-1.5b-instruct",
997
+ ),
998
+ (
999
+ "qwen/Qwen2-1.5B-Instruct",
1000
+ "vocab.json",
1001
+ "models/QwenPrompt/qwen2-1.5b-instruct",
1002
+ ),
1003
+ ],
1004
+ "load_path": [
1005
+ "models/QwenPrompt/qwen2-1.5b-instruct",
1006
+ ],
1007
+ },
1008
+ # Beautiful Prompt
1009
+ "BeautifulPrompt": {
1010
+ "file_list": [
1011
+ (
1012
+ "AI-ModelScope/pai-bloom-1b1-text2prompt-sd",
1013
+ "config.json",
1014
+ "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd",
1015
+ ),
1016
+ (
1017
+ "AI-ModelScope/pai-bloom-1b1-text2prompt-sd",
1018
+ "generation_config.json",
1019
+ "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd",
1020
+ ),
1021
+ (
1022
+ "AI-ModelScope/pai-bloom-1b1-text2prompt-sd",
1023
+ "model.safetensors",
1024
+ "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd",
1025
+ ),
1026
+ (
1027
+ "AI-ModelScope/pai-bloom-1b1-text2prompt-sd",
1028
+ "special_tokens_map.json",
1029
+ "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd",
1030
+ ),
1031
+ (
1032
+ "AI-ModelScope/pai-bloom-1b1-text2prompt-sd",
1033
+ "tokenizer.json",
1034
+ "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd",
1035
+ ),
1036
+ (
1037
+ "AI-ModelScope/pai-bloom-1b1-text2prompt-sd",
1038
+ "tokenizer_config.json",
1039
+ "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd",
1040
+ ),
1041
+ ],
1042
+ "load_path": [
1043
+ "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd",
1044
+ ],
1045
+ },
1046
+ # Omost prompt
1047
+ "OmostPrompt": {
1048
+ "file_list": [
1049
+ (
1050
+ "Omost/omost-llama-3-8b-4bits",
1051
+ "model-00001-of-00002.safetensors",
1052
+ "models/OmostPrompt/omost-llama-3-8b-4bits",
1053
+ ),
1054
+ (
1055
+ "Omost/omost-llama-3-8b-4bits",
1056
+ "model-00002-of-00002.safetensors",
1057
+ "models/OmostPrompt/omost-llama-3-8b-4bits",
1058
+ ),
1059
+ (
1060
+ "Omost/omost-llama-3-8b-4bits",
1061
+ "tokenizer.json",
1062
+ "models/OmostPrompt/omost-llama-3-8b-4bits",
1063
+ ),
1064
+ (
1065
+ "Omost/omost-llama-3-8b-4bits",
1066
+ "tokenizer_config.json",
1067
+ "models/OmostPrompt/omost-llama-3-8b-4bits",
1068
+ ),
1069
+ (
1070
+ "Omost/omost-llama-3-8b-4bits",
1071
+ "config.json",
1072
+ "models/OmostPrompt/omost-llama-3-8b-4bits",
1073
+ ),
1074
+ (
1075
+ "Omost/omost-llama-3-8b-4bits",
1076
+ "generation_config.json",
1077
+ "models/OmostPrompt/omost-llama-3-8b-4bits",
1078
+ ),
1079
+ (
1080
+ "Omost/omost-llama-3-8b-4bits",
1081
+ "model.safetensors.index.json",
1082
+ "models/OmostPrompt/omost-llama-3-8b-4bits",
1083
+ ),
1084
+ (
1085
+ "Omost/omost-llama-3-8b-4bits",
1086
+ "special_tokens_map.json",
1087
+ "models/OmostPrompt/omost-llama-3-8b-4bits",
1088
+ ),
1089
+ ],
1090
+ "load_path": [
1091
+ "models/OmostPrompt/omost-llama-3-8b-4bits",
1092
+ ],
1093
+ },
1094
+ # Translator
1095
+ "opus-mt-zh-en": {
1096
+ "file_list": [
1097
+ ("moxying/opus-mt-zh-en", "config.json", "models/translator/opus-mt-zh-en"),
1098
+ (
1099
+ "moxying/opus-mt-zh-en",
1100
+ "generation_config.json",
1101
+ "models/translator/opus-mt-zh-en",
1102
+ ),
1103
+ (
1104
+ "moxying/opus-mt-zh-en",
1105
+ "metadata.json",
1106
+ "models/translator/opus-mt-zh-en",
1107
+ ),
1108
+ (
1109
+ "moxying/opus-mt-zh-en",
1110
+ "pytorch_model.bin",
1111
+ "models/translator/opus-mt-zh-en",
1112
+ ),
1113
+ ("moxying/opus-mt-zh-en", "source.spm", "models/translator/opus-mt-zh-en"),
1114
+ ("moxying/opus-mt-zh-en", "target.spm", "models/translator/opus-mt-zh-en"),
1115
+ (
1116
+ "moxying/opus-mt-zh-en",
1117
+ "tokenizer_config.json",
1118
+ "models/translator/opus-mt-zh-en",
1119
+ ),
1120
+ ("moxying/opus-mt-zh-en", "vocab.json", "models/translator/opus-mt-zh-en"),
1121
+ ],
1122
+ "load_path": [
1123
+ "models/translator/opus-mt-zh-en",
1124
+ ],
1125
+ },
1126
+ # IP-Adapter
1127
+ "IP-Adapter-SD": [
1128
+ (
1129
+ "AI-ModelScope/IP-Adapter",
1130
+ "models/image_encoder/model.safetensors",
1131
+ "models/IpAdapter/stable_diffusion/image_encoder",
1132
+ ),
1133
+ (
1134
+ "AI-ModelScope/IP-Adapter",
1135
+ "models/ip-adapter_sd15.bin",
1136
+ "models/IpAdapter/stable_diffusion",
1137
+ ),
1138
+ ],
1139
+ "IP-Adapter-SDXL": [
1140
+ (
1141
+ "AI-ModelScope/IP-Adapter",
1142
+ "sdxl_models/image_encoder/model.safetensors",
1143
+ "models/IpAdapter/stable_diffusion_xl/image_encoder",
1144
+ ),
1145
+ (
1146
+ "AI-ModelScope/IP-Adapter",
1147
+ "sdxl_models/ip-adapter_sdxl.bin",
1148
+ "models/IpAdapter/stable_diffusion_xl",
1149
+ ),
1150
+ ],
1151
+ # Kolors
1152
+ "Kolors": {
1153
+ "file_list": [
1154
+ (
1155
+ "Kwai-Kolors/Kolors",
1156
+ "text_encoder/config.json",
1157
+ "models/kolors/Kolors/text_encoder",
1158
+ ),
1159
+ (
1160
+ "Kwai-Kolors/Kolors",
1161
+ "text_encoder/pytorch_model.bin.index.json",
1162
+ "models/kolors/Kolors/text_encoder",
1163
+ ),
1164
+ (
1165
+ "Kwai-Kolors/Kolors",
1166
+ "text_encoder/pytorch_model-00001-of-00007.bin",
1167
+ "models/kolors/Kolors/text_encoder",
1168
+ ),
1169
+ (
1170
+ "Kwai-Kolors/Kolors",
1171
+ "text_encoder/pytorch_model-00002-of-00007.bin",
1172
+ "models/kolors/Kolors/text_encoder",
1173
+ ),
1174
+ (
1175
+ "Kwai-Kolors/Kolors",
1176
+ "text_encoder/pytorch_model-00003-of-00007.bin",
1177
+ "models/kolors/Kolors/text_encoder",
1178
+ ),
1179
+ (
1180
+ "Kwai-Kolors/Kolors",
1181
+ "text_encoder/pytorch_model-00004-of-00007.bin",
1182
+ "models/kolors/Kolors/text_encoder",
1183
+ ),
1184
+ (
1185
+ "Kwai-Kolors/Kolors",
1186
+ "text_encoder/pytorch_model-00005-of-00007.bin",
1187
+ "models/kolors/Kolors/text_encoder",
1188
+ ),
1189
+ (
1190
+ "Kwai-Kolors/Kolors",
1191
+ "text_encoder/pytorch_model-00006-of-00007.bin",
1192
+ "models/kolors/Kolors/text_encoder",
1193
+ ),
1194
+ (
1195
+ "Kwai-Kolors/Kolors",
1196
+ "text_encoder/pytorch_model-00007-of-00007.bin",
1197
+ "models/kolors/Kolors/text_encoder",
1198
+ ),
1199
+ (
1200
+ "Kwai-Kolors/Kolors",
1201
+ "unet/diffusion_pytorch_model.safetensors",
1202
+ "models/kolors/Kolors/unet",
1203
+ ),
1204
+ (
1205
+ "Kwai-Kolors/Kolors",
1206
+ "vae/diffusion_pytorch_model.safetensors",
1207
+ "models/kolors/Kolors/vae",
1208
+ ),
1209
+ ],
1210
+ "load_path": [
1211
+ "models/kolors/Kolors/text_encoder",
1212
+ "models/kolors/Kolors/unet/diffusion_pytorch_model.safetensors",
1213
+ "models/kolors/Kolors/vae/diffusion_pytorch_model.safetensors",
1214
+ ],
1215
+ },
1216
+ "SDXL-vae-fp16-fix": [
1217
+ (
1218
+ "AI-ModelScope/sdxl-vae-fp16-fix",
1219
+ "diffusion_pytorch_model.safetensors",
1220
+ "models/sdxl-vae-fp16-fix",
1221
+ )
1222
+ ],
1223
+ # FLUX
1224
+ "FLUX.1-dev": {
1225
+ "file_list": [
1226
+ (
1227
+ "AI-ModelScope/FLUX.1-dev",
1228
+ "text_encoder/model.safetensors",
1229
+ "models/FLUX/FLUX.1-dev/text_encoder",
1230
+ ),
1231
+ (
1232
+ "AI-ModelScope/FLUX.1-dev",
1233
+ "text_encoder_2/config.json",
1234
+ "models/FLUX/FLUX.1-dev/text_encoder_2",
1235
+ ),
1236
+ (
1237
+ "AI-ModelScope/FLUX.1-dev",
1238
+ "text_encoder_2/model-00001-of-00002.safetensors",
1239
+ "models/FLUX/FLUX.1-dev/text_encoder_2",
1240
+ ),
1241
+ (
1242
+ "AI-ModelScope/FLUX.1-dev",
1243
+ "text_encoder_2/model-00002-of-00002.safetensors",
1244
+ "models/FLUX/FLUX.1-dev/text_encoder_2",
1245
+ ),
1246
+ (
1247
+ "AI-ModelScope/FLUX.1-dev",
1248
+ "text_encoder_2/model.safetensors.index.json",
1249
+ "models/FLUX/FLUX.1-dev/text_encoder_2",
1250
+ ),
1251
+ ("AI-ModelScope/FLUX.1-dev", "ae.safetensors", "models/FLUX/FLUX.1-dev"),
1252
+ (
1253
+ "AI-ModelScope/FLUX.1-dev",
1254
+ "flux1-dev.safetensors",
1255
+ "models/FLUX/FLUX.1-dev",
1256
+ ),
1257
+ ],
1258
+ "load_path": [
1259
+ "models/FLUX/FLUX.1-dev/text_encoder/model.safetensors",
1260
+ "models/FLUX/FLUX.1-dev/text_encoder_2",
1261
+ "models/FLUX/FLUX.1-dev/ae.safetensors",
1262
+ "models/FLUX/FLUX.1-dev/flux1-dev.safetensors",
1263
+ ],
1264
+ },
1265
+ "FLUX.1-schnell": {
1266
+ "file_list": [
1267
+ (
1268
+ "AI-ModelScope/FLUX.1-dev",
1269
+ "text_encoder/model.safetensors",
1270
+ "models/FLUX/FLUX.1-dev/text_encoder",
1271
+ ),
1272
+ (
1273
+ "AI-ModelScope/FLUX.1-dev",
1274
+ "text_encoder_2/config.json",
1275
+ "models/FLUX/FLUX.1-dev/text_encoder_2",
1276
+ ),
1277
+ (
1278
+ "AI-ModelScope/FLUX.1-dev",
1279
+ "text_encoder_2/model-00001-of-00002.safetensors",
1280
+ "models/FLUX/FLUX.1-dev/text_encoder_2",
1281
+ ),
1282
+ (
1283
+ "AI-ModelScope/FLUX.1-dev",
1284
+ "text_encoder_2/model-00002-of-00002.safetensors",
1285
+ "models/FLUX/FLUX.1-dev/text_encoder_2",
1286
+ ),
1287
+ (
1288
+ "AI-ModelScope/FLUX.1-dev",
1289
+ "text_encoder_2/model.safetensors.index.json",
1290
+ "models/FLUX/FLUX.1-dev/text_encoder_2",
1291
+ ),
1292
+ ("AI-ModelScope/FLUX.1-dev", "ae.safetensors", "models/FLUX/FLUX.1-dev"),
1293
+ (
1294
+ "AI-ModelScope/FLUX.1-schnell",
1295
+ "flux1-schnell.safetensors",
1296
+ "models/FLUX/FLUX.1-schnell",
1297
+ ),
1298
+ ],
1299
+ "load_path": [
1300
+ "models/FLUX/FLUX.1-dev/text_encoder/model.safetensors",
1301
+ "models/FLUX/FLUX.1-dev/text_encoder_2",
1302
+ "models/FLUX/FLUX.1-dev/ae.safetensors",
1303
+ "models/FLUX/FLUX.1-schnell/flux1-schnell.safetensors",
1304
+ ],
1305
+ },
1306
+ "InstantX/FLUX.1-dev-Controlnet-Union-alpha": [
1307
+ (
1308
+ "InstantX/FLUX.1-dev-Controlnet-Union-alpha",
1309
+ "diffusion_pytorch_model.safetensors",
1310
+ "models/ControlNet/InstantX/FLUX.1-dev-Controlnet-Union-alpha",
1311
+ ),
1312
+ ],
1313
+ "jasperai/Flux.1-dev-Controlnet-Depth": [
1314
+ (
1315
+ "jasperai/Flux.1-dev-Controlnet-Depth",
1316
+ "diffusion_pytorch_model.safetensors",
1317
+ "models/ControlNet/jasperai/Flux.1-dev-Controlnet-Depth",
1318
+ ),
1319
+ ],
1320
+ "jasperai/Flux.1-dev-Controlnet-Surface-Normals": [
1321
+ (
1322
+ "jasperai/Flux.1-dev-Controlnet-Surface-Normals",
1323
+ "diffusion_pytorch_model.safetensors",
1324
+ "models/ControlNet/jasperai/Flux.1-dev-Controlnet-Surface-Normals",
1325
+ ),
1326
+ ],
1327
+ "jasperai/Flux.1-dev-Controlnet-Upscaler": [
1328
+ (
1329
+ "jasperai/Flux.1-dev-Controlnet-Upscaler",
1330
+ "diffusion_pytorch_model.safetensors",
1331
+ "models/ControlNet/jasperai/Flux.1-dev-Controlnet-Upscaler",
1332
+ ),
1333
+ ],
1334
+ "alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Alpha": [
1335
+ (
1336
+ "alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Alpha",
1337
+ "diffusion_pytorch_model.safetensors",
1338
+ "models/ControlNet/alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Alpha",
1339
+ ),
1340
+ ],
1341
+ "alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Beta": [
1342
+ (
1343
+ "alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Beta",
1344
+ "diffusion_pytorch_model.safetensors",
1345
+ "models/ControlNet/alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Beta",
1346
+ ),
1347
+ ],
1348
+ "Shakker-Labs/FLUX.1-dev-ControlNet-Depth": [
1349
+ (
1350
+ "Shakker-Labs/FLUX.1-dev-ControlNet-Depth",
1351
+ "diffusion_pytorch_model.safetensors",
1352
+ "models/ControlNet/Shakker-Labs/FLUX.1-dev-ControlNet-Depth",
1353
+ ),
1354
+ ],
1355
+ "Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro": [
1356
+ (
1357
+ "Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro",
1358
+ "diffusion_pytorch_model.safetensors",
1359
+ "models/ControlNet/Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro",
1360
+ ),
1361
+ ],
1362
+ "InstantX/FLUX.1-dev-IP-Adapter": {
1363
+ "file_list": [
1364
+ (
1365
+ "InstantX/FLUX.1-dev-IP-Adapter",
1366
+ "ip-adapter.bin",
1367
+ "models/IpAdapter/InstantX/FLUX.1-dev-IP-Adapter",
1368
+ ),
1369
+ (
1370
+ "AI-ModelScope/siglip-so400m-patch14-384",
1371
+ "model.safetensors",
1372
+ "models/IpAdapter/InstantX/FLUX.1-dev-IP-Adapter/image_encoder",
1373
+ ),
1374
+ (
1375
+ "AI-ModelScope/siglip-so400m-patch14-384",
1376
+ "config.json",
1377
+ "models/IpAdapter/InstantX/FLUX.1-dev-IP-Adapter/image_encoder",
1378
+ ),
1379
+ ],
1380
+ "load_path": [
1381
+ "models/IpAdapter/InstantX/FLUX.1-dev-IP-Adapter/ip-adapter.bin",
1382
+ "models/IpAdapter/InstantX/FLUX.1-dev-IP-Adapter/image_encoder",
1383
+ ],
1384
+ },
1385
+ "InfiniteYou": {
1386
+ "file_list": [
1387
+ (
1388
+ "ByteDance/InfiniteYou",
1389
+ "infu_flux_v1.0/aes_stage2/InfuseNetModel/diffusion_pytorch_model-00001-of-00002.safetensors",
1390
+ "models/InfiniteYou/InfuseNetModel",
1391
+ ),
1392
+ (
1393
+ "ByteDance/InfiniteYou",
1394
+ "infu_flux_v1.0/aes_stage2/InfuseNetModel/diffusion_pytorch_model-00002-of-00002.safetensors",
1395
+ "models/InfiniteYou/InfuseNetModel",
1396
+ ),
1397
+ (
1398
+ "ByteDance/InfiniteYou",
1399
+ "infu_flux_v1.0/aes_stage2/image_proj_model.bin",
1400
+ "models/InfiniteYou",
1401
+ ),
1402
+ (
1403
+ "ByteDance/InfiniteYou",
1404
+ "supports/insightface/models/antelopev2/1k3d68.onnx",
1405
+ "models/InfiniteYou/insightface/models/antelopev2",
1406
+ ),
1407
+ (
1408
+ "ByteDance/InfiniteYou",
1409
+ "supports/insightface/models/antelopev2/2d106det.onnx",
1410
+ "models/InfiniteYou/insightface/models/antelopev2",
1411
+ ),
1412
+ (
1413
+ "ByteDance/InfiniteYou",
1414
+ "supports/insightface/models/antelopev2/genderage.onnx",
1415
+ "models/InfiniteYou/insightface/models/antelopev2",
1416
+ ),
1417
+ (
1418
+ "ByteDance/InfiniteYou",
1419
+ "supports/insightface/models/antelopev2/glintr100.onnx",
1420
+ "models/InfiniteYou/insightface/models/antelopev2",
1421
+ ),
1422
+ (
1423
+ "ByteDance/InfiniteYou",
1424
+ "supports/insightface/models/antelopev2/scrfd_10g_bnkps.onnx",
1425
+ "models/InfiniteYou/insightface/models/antelopev2",
1426
+ ),
1427
+ ],
1428
+ "load_path": [
1429
+ [
1430
+ "models/InfiniteYou/InfuseNetModel/diffusion_pytorch_model-00001-of-00002.safetensors",
1431
+ "models/InfiniteYou/InfuseNetModel/diffusion_pytorch_model-00002-of-00002.safetensors",
1432
+ ],
1433
+ "models/InfiniteYou/image_proj_model.bin",
1434
+ ],
1435
+ },
1436
+ # ESRGAN
1437
+ "ESRGAN_x4": [
1438
+ ("AI-ModelScope/Real-ESRGAN", "RealESRGAN_x4.pth", "models/ESRGAN"),
1439
+ ],
1440
+ # RIFE
1441
+ "RIFE": [
1442
+ ("AI-ModelScope/RIFE", "flownet.pkl", "models/RIFE"),
1443
+ ],
1444
+ # Omnigen
1445
+ "OmniGen-v1": {
1446
+ "file_list": [
1447
+ (
1448
+ "BAAI/OmniGen-v1",
1449
+ "vae/diffusion_pytorch_model.safetensors",
1450
+ "models/OmniGen/OmniGen-v1/vae",
1451
+ ),
1452
+ ("BAAI/OmniGen-v1", "model.safetensors", "models/OmniGen/OmniGen-v1"),
1453
+ ("BAAI/OmniGen-v1", "config.json", "models/OmniGen/OmniGen-v1"),
1454
+ ("BAAI/OmniGen-v1", "special_tokens_map.json", "models/OmniGen/OmniGen-v1"),
1455
+ ("BAAI/OmniGen-v1", "tokenizer_config.json", "models/OmniGen/OmniGen-v1"),
1456
+ ("BAAI/OmniGen-v1", "tokenizer.json", "models/OmniGen/OmniGen-v1"),
1457
+ ],
1458
+ "load_path": [
1459
+ "models/OmniGen/OmniGen-v1/vae/diffusion_pytorch_model.safetensors",
1460
+ "models/OmniGen/OmniGen-v1/model.safetensors",
1461
+ ],
1462
+ },
1463
+ # CogVideo
1464
+ "CogVideoX-5B": {
1465
+ "file_list": [
1466
+ (
1467
+ "ZhipuAI/CogVideoX-5b",
1468
+ "text_encoder/config.json",
1469
+ "models/CogVideo/CogVideoX-5b/text_encoder",
1470
+ ),
1471
+ (
1472
+ "ZhipuAI/CogVideoX-5b",
1473
+ "text_encoder/model.safetensors.index.json",
1474
+ "models/CogVideo/CogVideoX-5b/text_encoder",
1475
+ ),
1476
+ (
1477
+ "ZhipuAI/CogVideoX-5b",
1478
+ "text_encoder/model-00001-of-00002.safetensors",
1479
+ "models/CogVideo/CogVideoX-5b/text_encoder",
1480
+ ),
1481
+ (
1482
+ "ZhipuAI/CogVideoX-5b",
1483
+ "text_encoder/model-00002-of-00002.safetensors",
1484
+ "models/CogVideo/CogVideoX-5b/text_encoder",
1485
+ ),
1486
+ (
1487
+ "ZhipuAI/CogVideoX-5b",
1488
+ "transformer/config.json",
1489
+ "models/CogVideo/CogVideoX-5b/transformer",
1490
+ ),
1491
+ (
1492
+ "ZhipuAI/CogVideoX-5b",
1493
+ "transformer/diffusion_pytorch_model.safetensors.index.json",
1494
+ "models/CogVideo/CogVideoX-5b/transformer",
1495
+ ),
1496
+ (
1497
+ "ZhipuAI/CogVideoX-5b",
1498
+ "transformer/diffusion_pytorch_model-00001-of-00002.safetensors",
1499
+ "models/CogVideo/CogVideoX-5b/transformer",
1500
+ ),
1501
+ (
1502
+ "ZhipuAI/CogVideoX-5b",
1503
+ "transformer/diffusion_pytorch_model-00002-of-00002.safetensors",
1504
+ "models/CogVideo/CogVideoX-5b/transformer",
1505
+ ),
1506
+ (
1507
+ "ZhipuAI/CogVideoX-5b",
1508
+ "vae/diffusion_pytorch_model.safetensors",
1509
+ "models/CogVideo/CogVideoX-5b/vae",
1510
+ ),
1511
+ ],
1512
+ "load_path": [
1513
+ "models/CogVideo/CogVideoX-5b/text_encoder",
1514
+ "models/CogVideo/CogVideoX-5b/transformer",
1515
+ "models/CogVideo/CogVideoX-5b/vae/diffusion_pytorch_model.safetensors",
1516
+ ],
1517
+ },
1518
+ # Stable Diffusion 3.5
1519
+ "StableDiffusion3.5-large": [
1520
+ (
1521
+ "AI-ModelScope/stable-diffusion-3.5-large",
1522
+ "sd3.5_large.safetensors",
1523
+ "models/stable_diffusion_3",
1524
+ ),
1525
+ (
1526
+ "AI-ModelScope/stable-diffusion-3.5-large",
1527
+ "text_encoders/clip_l.safetensors",
1528
+ "models/stable_diffusion_3/text_encoders",
1529
+ ),
1530
+ (
1531
+ "AI-ModelScope/stable-diffusion-3.5-large",
1532
+ "text_encoders/clip_g.safetensors",
1533
+ "models/stable_diffusion_3/text_encoders",
1534
+ ),
1535
+ (
1536
+ "AI-ModelScope/stable-diffusion-3.5-large",
1537
+ "text_encoders/t5xxl_fp16.safetensors",
1538
+ "models/stable_diffusion_3/text_encoders",
1539
+ ),
1540
+ ],
1541
+ "StableDiffusion3.5-medium": [
1542
+ (
1543
+ "AI-ModelScope/stable-diffusion-3.5-medium",
1544
+ "sd3.5_medium.safetensors",
1545
+ "models/stable_diffusion_3",
1546
+ ),
1547
+ (
1548
+ "AI-ModelScope/stable-diffusion-3.5-large",
1549
+ "text_encoders/clip_l.safetensors",
1550
+ "models/stable_diffusion_3/text_encoders",
1551
+ ),
1552
+ (
1553
+ "AI-ModelScope/stable-diffusion-3.5-large",
1554
+ "text_encoders/clip_g.safetensors",
1555
+ "models/stable_diffusion_3/text_encoders",
1556
+ ),
1557
+ (
1558
+ "AI-ModelScope/stable-diffusion-3.5-large",
1559
+ "text_encoders/t5xxl_fp16.safetensors",
1560
+ "models/stable_diffusion_3/text_encoders",
1561
+ ),
1562
+ ],
1563
+ "StableDiffusion3.5-large-turbo": [
1564
+ (
1565
+ "AI-ModelScope/stable-diffusion-3.5-large-turbo",
1566
+ "sd3.5_large_turbo.safetensors",
1567
+ "models/stable_diffusion_3",
1568
+ ),
1569
+ (
1570
+ "AI-ModelScope/stable-diffusion-3.5-large",
1571
+ "text_encoders/clip_l.safetensors",
1572
+ "models/stable_diffusion_3/text_encoders",
1573
+ ),
1574
+ (
1575
+ "AI-ModelScope/stable-diffusion-3.5-large",
1576
+ "text_encoders/clip_g.safetensors",
1577
+ "models/stable_diffusion_3/text_encoders",
1578
+ ),
1579
+ (
1580
+ "AI-ModelScope/stable-diffusion-3.5-large",
1581
+ "text_encoders/t5xxl_fp16.safetensors",
1582
+ "models/stable_diffusion_3/text_encoders",
1583
+ ),
1584
+ ],
1585
+ "HunyuanVideo": {
1586
+ "file_list": [
1587
+ (
1588
+ "AI-ModelScope/clip-vit-large-patch14",
1589
+ "model.safetensors",
1590
+ "models/HunyuanVideo/text_encoder",
1591
+ ),
1592
+ (
1593
+ "DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder",
1594
+ "model-00001-of-00004.safetensors",
1595
+ "models/HunyuanVideo/text_encoder_2",
1596
+ ),
1597
+ (
1598
+ "DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder",
1599
+ "model-00002-of-00004.safetensors",
1600
+ "models/HunyuanVideo/text_encoder_2",
1601
+ ),
1602
+ (
1603
+ "DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder",
1604
+ "model-00003-of-00004.safetensors",
1605
+ "models/HunyuanVideo/text_encoder_2",
1606
+ ),
1607
+ (
1608
+ "DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder",
1609
+ "model-00004-of-00004.safetensors",
1610
+ "models/HunyuanVideo/text_encoder_2",
1611
+ ),
1612
+ (
1613
+ "DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder",
1614
+ "config.json",
1615
+ "models/HunyuanVideo/text_encoder_2",
1616
+ ),
1617
+ (
1618
+ "DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder",
1619
+ "model.safetensors.index.json",
1620
+ "models/HunyuanVideo/text_encoder_2",
1621
+ ),
1622
+ (
1623
+ "AI-ModelScope/HunyuanVideo",
1624
+ "hunyuan-video-t2v-720p/vae/pytorch_model.pt",
1625
+ "models/HunyuanVideo/vae",
1626
+ ),
1627
+ (
1628
+ "AI-ModelScope/HunyuanVideo",
1629
+ "hunyuan-video-t2v-720p/transformers/mp_rank_00_model_states.pt",
1630
+ "models/HunyuanVideo/transformers",
1631
+ ),
1632
+ ],
1633
+ "load_path": [
1634
+ "models/HunyuanVideo/text_encoder/model.safetensors",
1635
+ "models/HunyuanVideo/text_encoder_2",
1636
+ "models/HunyuanVideo/vae/pytorch_model.pt",
1637
+ "models/HunyuanVideo/transformers/mp_rank_00_model_states.pt",
1638
+ ],
1639
+ },
1640
+ "HunyuanVideoI2V": {
1641
+ "file_list": [
1642
+ (
1643
+ "AI-ModelScope/clip-vit-large-patch14",
1644
+ "model.safetensors",
1645
+ "models/HunyuanVideoI2V/text_encoder",
1646
+ ),
1647
+ (
1648
+ "AI-ModelScope/llava-llama-3-8b-v1_1-transformers",
1649
+ "model-00001-of-00004.safetensors",
1650
+ "models/HunyuanVideoI2V/text_encoder_2",
1651
+ ),
1652
+ (
1653
+ "AI-ModelScope/llava-llama-3-8b-v1_1-transformers",
1654
+ "model-00002-of-00004.safetensors",
1655
+ "models/HunyuanVideoI2V/text_encoder_2",
1656
+ ),
1657
+ (
1658
+ "AI-ModelScope/llava-llama-3-8b-v1_1-transformers",
1659
+ "model-00003-of-00004.safetensors",
1660
+ "models/HunyuanVideoI2V/text_encoder_2",
1661
+ ),
1662
+ (
1663
+ "AI-ModelScope/llava-llama-3-8b-v1_1-transformers",
1664
+ "model-00004-of-00004.safetensors",
1665
+ "models/HunyuanVideoI2V/text_encoder_2",
1666
+ ),
1667
+ (
1668
+ "AI-ModelScope/llava-llama-3-8b-v1_1-transformers",
1669
+ "config.json",
1670
+ "models/HunyuanVideoI2V/text_encoder_2",
1671
+ ),
1672
+ (
1673
+ "AI-ModelScope/llava-llama-3-8b-v1_1-transformers",
1674
+ "model.safetensors.index.json",
1675
+ "models/HunyuanVideoI2V/text_encoder_2",
1676
+ ),
1677
+ (
1678
+ "AI-ModelScope/HunyuanVideo-I2V",
1679
+ "hunyuan-video-i2v-720p/vae/pytorch_model.pt",
1680
+ "models/HunyuanVideoI2V/vae",
1681
+ ),
1682
+ (
1683
+ "AI-ModelScope/HunyuanVideo-I2V",
1684
+ "hunyuan-video-i2v-720p/transformers/mp_rank_00_model_states.pt",
1685
+ "models/HunyuanVideoI2V/transformers",
1686
+ ),
1687
+ ],
1688
+ "load_path": [
1689
+ "models/HunyuanVideoI2V/text_encoder/model.safetensors",
1690
+ "models/HunyuanVideoI2V/text_encoder_2",
1691
+ "models/HunyuanVideoI2V/vae/pytorch_model.pt",
1692
+ "models/HunyuanVideoI2V/transformers/mp_rank_00_model_states.pt",
1693
+ ],
1694
+ },
1695
+ "HunyuanVideo-fp8": {
1696
+ "file_list": [
1697
+ (
1698
+ "AI-ModelScope/clip-vit-large-patch14",
1699
+ "model.safetensors",
1700
+ "models/HunyuanVideo/text_encoder",
1701
+ ),
1702
+ (
1703
+ "DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder",
1704
+ "model-00001-of-00004.safetensors",
1705
+ "models/HunyuanVideo/text_encoder_2",
1706
+ ),
1707
+ (
1708
+ "DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder",
1709
+ "model-00002-of-00004.safetensors",
1710
+ "models/HunyuanVideo/text_encoder_2",
1711
+ ),
1712
+ (
1713
+ "DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder",
1714
+ "model-00003-of-00004.safetensors",
1715
+ "models/HunyuanVideo/text_encoder_2",
1716
+ ),
1717
+ (
1718
+ "DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder",
1719
+ "model-00004-of-00004.safetensors",
1720
+ "models/HunyuanVideo/text_encoder_2",
1721
+ ),
1722
+ (
1723
+ "DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder",
1724
+ "config.json",
1725
+ "models/HunyuanVideo/text_encoder_2",
1726
+ ),
1727
+ (
1728
+ "DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder",
1729
+ "model.safetensors.index.json",
1730
+ "models/HunyuanVideo/text_encoder_2",
1731
+ ),
1732
+ (
1733
+ "AI-ModelScope/HunyuanVideo",
1734
+ "hunyuan-video-t2v-720p/vae/pytorch_model.pt",
1735
+ "models/HunyuanVideo/vae",
1736
+ ),
1737
+ (
1738
+ "DiffSynth-Studio/HunyuanVideo-safetensors",
1739
+ "model.fp8.safetensors",
1740
+ "models/HunyuanVideo/transformers",
1741
+ ),
1742
+ ],
1743
+ "load_path": [
1744
+ "models/HunyuanVideo/text_encoder/model.safetensors",
1745
+ "models/HunyuanVideo/text_encoder_2",
1746
+ "models/HunyuanVideo/vae/pytorch_model.pt",
1747
+ "models/HunyuanVideo/transformers/model.fp8.safetensors",
1748
+ ],
1749
+ },
1750
+ }
1751
+ Preset_model_id: TypeAlias = Literal[
1752
+ "HunyuanDiT",
1753
+ "stable-video-diffusion-img2vid-xt",
1754
+ "ExVideo-SVD-128f-v1",
1755
+ "ExVideo-CogVideoX-LoRA-129f-v1",
1756
+ "StableDiffusion_v15",
1757
+ "DreamShaper_8",
1758
+ "AingDiffusion_v12",
1759
+ "Flat2DAnimerge_v45Sharp",
1760
+ "TextualInversion_VeryBadImageNegative_v1.3",
1761
+ "StableDiffusionXL_v1",
1762
+ "BluePencilXL_v200",
1763
+ "StableDiffusionXL_Turbo",
1764
+ "ControlNet_v11f1p_sd15_depth",
1765
+ "ControlNet_v11p_sd15_softedge",
1766
+ "ControlNet_v11f1e_sd15_tile",
1767
+ "ControlNet_v11p_sd15_lineart",
1768
+ "AnimateDiff_v2",
1769
+ "AnimateDiff_xl_beta",
1770
+ "RIFE",
1771
+ "BeautifulPrompt",
1772
+ "opus-mt-zh-en",
1773
+ "IP-Adapter-SD",
1774
+ "IP-Adapter-SDXL",
1775
+ "StableDiffusion3",
1776
+ "StableDiffusion3_without_T5",
1777
+ "Kolors",
1778
+ "SDXL-vae-fp16-fix",
1779
+ "ControlNet_union_sdxl_promax",
1780
+ "FLUX.1-dev",
1781
+ "FLUX.1-schnell",
1782
+ "InstantX/FLUX.1-dev-Controlnet-Union-alpha",
1783
+ "jasperai/Flux.1-dev-Controlnet-Depth",
1784
+ "jasperai/Flux.1-dev-Controlnet-Surface-Normals",
1785
+ "jasperai/Flux.1-dev-Controlnet-Upscaler",
1786
+ "alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Alpha",
1787
+ "alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Beta",
1788
+ "Shakker-Labs/FLUX.1-dev-ControlNet-Depth",
1789
+ "Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro",
1790
+ "InstantX/FLUX.1-dev-IP-Adapter",
1791
+ "InfiniteYou",
1792
+ "SDXL_lora_zyd232_ChineseInkStyle_SDXL_v1_0",
1793
+ "QwenPrompt",
1794
+ "OmostPrompt",
1795
+ "ESRGAN_x4",
1796
+ "RIFE",
1797
+ "OmniGen-v1",
1798
+ "CogVideoX-5B",
1799
+ "Annotators:Depth",
1800
+ "Annotators:Softedge",
1801
+ "Annotators:Lineart",
1802
+ "Annotators:Normal",
1803
+ "Annotators:Openpose",
1804
+ "StableDiffusion3.5-large",
1805
+ "StableDiffusion3.5-medium",
1806
+ "HunyuanVideo",
1807
+ "HunyuanVideo-fp8",
1808
+ "HunyuanVideoI2V",
1809
+ ]
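
Each preset above maps a model name to (repo_id, file_path, local_dir) triples, optionally wrapped in a dict that also records the load_path entries a pipeline should load afterwards. The sketch below is illustrative only (the helper name fetch_preset is not part of the repository): it walks one Hugging Face-style entry with huggingface_hub; ModelScope-style entries would use modelscope's download API instead. Note that hf_hub_download keeps the repo-relative subfolder under local_dir, so a real downloader may still need to move files into the flat layout the load_path entries expect.

# Illustrative only: walk one preset entry and fetch its files from the Hugging Face Hub.
from huggingface_hub import hf_hub_download

def fetch_preset(preset_name, table):
    entry = table[preset_name]
    file_list = entry["file_list"] if isinstance(entry, dict) else entry
    for repo_id, filename, local_dir in file_list:
        # hf_hub_download preserves the repo-relative path below local_dir
        hf_hub_download(repo_id=repo_id, filename=filename, local_dir=local_dir)
    # dict-style entries also tell the caller which paths to load into the pipeline
    return entry["load_path"] if isinstance(entry, dict) else None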
data/video.py ADDED
@@ -0,0 +1,158 @@
1
+ import imageio, os
2
+ import numpy as np
3
+ from PIL import Image
4
+ from tqdm import tqdm
5
+
6
+
7
+ class LowMemoryVideo:
8
+ def __init__(self, file_name):
9
+ self.reader = imageio.get_reader(file_name)
10
+
11
+ def __len__(self):
12
+ return self.reader.count_frames()
13
+
14
+ def __getitem__(self, item):
15
+ return Image.fromarray(np.array(self.reader.get_data(item))).convert("RGB")
16
+
17
+ def __del__(self):
18
+ self.reader.close()
19
+
20
+
21
+ def split_file_name(file_name):
22
+ result = []
23
+ number = -1
24
+ for i in file_name:
25
+ if ord(i) >= ord("0") and ord(i) <= ord("9"):
26
+ if number == -1:
27
+ number = 0
28
+ number = number * 10 + ord(i) - ord("0")
29
+ else:
30
+ if number != -1:
31
+ result.append(number)
32
+ number = -1
33
+ result.append(i)
34
+ if number != -1:
35
+ result.append(number)
36
+ result = tuple(result)
37
+ return result
38
+
39
+
40
+ def search_for_images(folder):
41
+ file_list = [
42
+ i for i in os.listdir(folder) if i.endswith(".jpg") or i.endswith(".png")
43
+ ]
44
+ file_list = [(split_file_name(file_name), file_name) for file_name in file_list]
45
+ file_list = [i[1] for i in sorted(file_list)]
46
+ file_list = [os.path.join(folder, i) for i in file_list]
47
+ return file_list
48
+
49
+
50
+ class LowMemoryImageFolder:
51
+ def __init__(self, folder, file_list=None):
52
+ if file_list is None:
53
+ self.file_list = search_for_images(folder)
54
+ else:
55
+ self.file_list = [
56
+ os.path.join(folder, file_name) for file_name in file_list
57
+ ]
58
+
59
+ def __len__(self):
60
+ return len(self.file_list)
61
+
62
+ def __getitem__(self, item):
63
+ return Image.open(self.file_list[item]).convert("RGB")
64
+
65
+ def __del__(self):
66
+ pass
67
+
68
+
69
+ def crop_and_resize(image, height, width):
70
+ image = np.array(image)
71
+ image_height, image_width, _ = image.shape
72
+ if image_height / image_width < height / width:
73
+ croped_width = int(image_height / height * width)
74
+ left = (image_width - croped_width) // 2
75
+ image = image[:, left : left + croped_width]
76
+ image = Image.fromarray(image).resize((width, height))
77
+ else:
78
+ croped_height = int(image_width / width * height)
79
+ left = (image_height - croped_height) // 2
80
+ image = image[left : left + croped_height, :]
81
+ image = Image.fromarray(image).resize((width, height))
82
+ return image
83
+
84
+
85
+ class VideoData:
86
+ def __init__(
87
+ self, video_file=None, image_folder=None, height=None, width=None, **kwargs
88
+ ):
89
+ if video_file is not None:
90
+ self.data_type = "video"
91
+ self.data = LowMemoryVideo(video_file, **kwargs)
92
+ elif image_folder is not None:
93
+ self.data_type = "images"
94
+ self.data = LowMemoryImageFolder(image_folder, **kwargs)
95
+ else:
96
+ raise ValueError("Cannot open video or image folder")
97
+ self.length = None
98
+ self.set_shape(height, width)
99
+
100
+ def raw_data(self):
101
+ frames = []
102
+ for i in range(self.__len__()):
103
+ frames.append(self.__getitem__(i))
104
+ return frames
105
+
106
+ def set_length(self, length):
107
+ self.length = length
108
+
109
+ def set_shape(self, height, width):
110
+ self.height = height
111
+ self.width = width
112
+
113
+ def __len__(self):
114
+ if self.length is None:
115
+ return len(self.data)
116
+ else:
117
+ return self.length
118
+
119
+ def shape(self):
120
+ if self.height is not None and self.width is not None:
121
+ return self.height, self.width
122
+ else:
123
+ height, width, _ = self.__getitem__(0).shape
124
+ return height, width
125
+
126
+ def __getitem__(self, item):
127
+ frame = self.data.__getitem__(item)
128
+ width, height = frame.size
129
+ if self.height is not None and self.width is not None:
130
+ if self.height != height or self.width != width:
131
+ frame = crop_and_resize(frame, self.height, self.width)
132
+ return frame
133
+
134
+ def __del__(self):
135
+ pass
136
+
137
+ def save_images(self, folder):
138
+ os.makedirs(folder, exist_ok=True)
139
+ for i in tqdm(range(self.__len__()), desc="Saving images"):
140
+ frame = self.__getitem__(i)
141
+ frame.save(os.path.join(folder, f"{i}.png"))
142
+
143
+
144
+ def save_video(frames, save_path, fps, quality=9, ffmpeg_params=None):
145
+ os.makedirs(os.path.dirname(save_path), exist_ok=True)
146
+ writer = imageio.get_writer(
147
+ save_path, fps=fps, quality=quality, ffmpeg_params=ffmpeg_params
148
+ )
149
+ for frame in tqdm(frames, desc="Saving video"):
150
+ frame = np.array(frame)
151
+ writer.append_data(frame)
152
+ writer.close()
153
+
154
+
155
+ def save_frames(frames, save_path):
156
+ os.makedirs(save_path, exist_ok=True)
157
+ for i, frame in enumerate(tqdm(frames, desc="Saving images")):
158
+ frame.save(os.path.join(save_path, f"{i}.png"))
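
A minimal usage sketch for the helpers above (paths are placeholders): VideoData decodes frames lazily, center-crops and resizes them to the requested shape, and save_video re-encodes the resulting PIL frames.

from data.video import VideoData, save_video

video = VideoData(video_file="test/input/woman.mp4", height=480, width=832)
frames = video.raw_data()  # list of PIL images, cropped and resized to 832x480
save_video(frames, "test/output/woman_480p.mp4", fps=25, quality=9)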
distributed/__init__.py ADDED
File without changes
distributed/xdit_context_parallel.py ADDED
@@ -0,0 +1,154 @@
1
+ import torch
2
+ from typing import Optional
3
+ from einops import rearrange
4
+ from xfuser.core.distributed import (
5
+ get_sequence_parallel_rank,
6
+ get_sequence_parallel_world_size,
7
+ get_sp_group,
8
+ )
9
+ from xfuser.core.long_ctx_attention import xFuserLongContextAttention
10
+
11
+
12
+ def sinusoidal_embedding_1d(dim, position):
13
+ sinusoid = torch.outer(
14
+ position.type(torch.float64),
15
+ torch.pow(
16
+ 10000,
17
+ -torch.arange(dim // 2, dtype=torch.float64, device=position.device).div(
18
+ dim // 2
19
+ ),
20
+ ),
21
+ )
22
+ x = torch.cat([torch.cos(sinusoid), torch.sin(sinusoid)], dim=1)
23
+ return x.to(position.dtype)
24
+
25
+
26
+ def pad_freqs(original_tensor, target_len):
27
+ seq_len, s1, s2 = original_tensor.shape
28
+ pad_size = target_len - seq_len
29
+ padding_tensor = torch.ones(
30
+ pad_size, s1, s2, dtype=original_tensor.dtype, device=original_tensor.device
31
+ )
32
+ padded_tensor = torch.cat([original_tensor, padding_tensor], dim=0)
33
+ return padded_tensor
34
+
35
+
36
+ def rope_apply(x, freqs, num_heads):
37
+ x = rearrange(x, "b s (n d) -> b s n d", n=num_heads)
38
+ s_per_rank = x.shape[1]
39
+
40
+ x_out = torch.view_as_complex(
41
+ x.to(torch.float64).reshape(x.shape[0], x.shape[1], x.shape[2], -1, 2)
42
+ )
43
+
44
+ sp_size = get_sequence_parallel_world_size()
45
+ sp_rank = get_sequence_parallel_rank()
46
+ freqs = pad_freqs(freqs, s_per_rank * sp_size)
47
+ freqs_rank = freqs[(sp_rank * s_per_rank) : ((sp_rank + 1) * s_per_rank), :, :]
48
+
49
+ x_out = torch.view_as_real(x_out * freqs_rank).flatten(2)
50
+ return x_out.to(x.dtype)
51
+
52
+
53
+ def usp_dit_forward(
54
+ self,
55
+ x: torch.Tensor,
56
+ timestep: torch.Tensor,
57
+ context: torch.Tensor,
58
+ clip_feature: Optional[torch.Tensor] = None,
59
+ y: Optional[torch.Tensor] = None,
60
+ use_gradient_checkpointing: bool = False,
61
+ use_gradient_checkpointing_offload: bool = False,
62
+ **kwargs,
63
+ ):
64
+ t = self.time_embedding(sinusoidal_embedding_1d(self.freq_dim, timestep))
65
+ t_mod = self.time_projection(t).unflatten(1, (6, self.dim))
66
+ context = self.text_embedding(context)
67
+
68
+ if self.has_image_input:
69
+ x = torch.cat([x, y], dim=1) # (b, c_x + c_y, f, h, w)
70
+ clip_embedding = self.img_emb(clip_feature)
71
+ context = torch.cat([clip_embedding, context], dim=1)
72
+
73
+ x, (f, h, w) = self.patchify(x)
74
+
75
+ freqs = (
76
+ torch.cat(
77
+ [
78
+ self.freqs[0][:f].view(f, 1, 1, -1).expand(f, h, w, -1),
79
+ self.freqs[1][:h].view(1, h, 1, -1).expand(f, h, w, -1),
80
+ self.freqs[2][:w].view(1, 1, w, -1).expand(f, h, w, -1),
81
+ ],
82
+ dim=-1,
83
+ )
84
+ .reshape(f * h * w, 1, -1)
85
+ .to(x.device)
86
+ )
87
+
88
+ def create_custom_forward(module):
89
+ def custom_forward(*inputs):
90
+ return module(*inputs)
91
+
92
+ return custom_forward
93
+
94
+ # Context Parallel
95
+ x = torch.chunk(x, get_sequence_parallel_world_size(), dim=1)[
96
+ get_sequence_parallel_rank()
97
+ ]
98
+
99
+ for block in self.blocks:
100
+ if self.training and use_gradient_checkpointing:
101
+ if use_gradient_checkpointing_offload:
102
+ with torch.autograd.graph.save_on_cpu():
103
+ x = torch.utils.checkpoint.checkpoint(
104
+ create_custom_forward(block),
105
+ x,
106
+ context,
107
+ t_mod,
108
+ freqs,
109
+ use_reentrant=False,
110
+ )
111
+ else:
112
+ x = torch.utils.checkpoint.checkpoint(
113
+ create_custom_forward(block),
114
+ x,
115
+ context,
116
+ t_mod,
117
+ freqs,
118
+ use_reentrant=False,
119
+ )
120
+ else:
121
+ x = block(x, context, t_mod, freqs)
122
+
123
+ x = self.head(x, t)
124
+
125
+ # Context Parallel
126
+ x = get_sp_group().all_gather(x, dim=1)
127
+
128
+ # unpatchify
129
+ x = self.unpatchify(x, (f, h, w))
130
+ return x
131
+
132
+
133
+ def usp_attn_forward(self, x, freqs):
134
+ q = self.norm_q(self.q(x))
135
+ k = self.norm_k(self.k(x))
136
+ v = self.v(x)
137
+
138
+ q = rope_apply(q, freqs, self.num_heads)
139
+ k = rope_apply(k, freqs, self.num_heads)
140
+ q = rearrange(q, "b s (n d) -> b s n d", n=self.num_heads)
141
+ k = rearrange(k, "b s (n d) -> b s n d", n=self.num_heads)
142
+ v = rearrange(v, "b s (n d) -> b s n d", n=self.num_heads)
143
+
144
+ x = xFuserLongContextAttention()(
145
+ None,
146
+ query=q,
147
+ key=k,
148
+ value=v,
149
+ )
150
+ x = x.flatten(2)
151
+
152
+ del q, k, v
153
+ torch.cuda.empty_cache()
154
+ return self.o(x)
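
These two functions are drop-in forwards for sequence-parallel inference: usp_dit_forward shards the patchified token sequence across the sequence-parallel group before the transformer blocks and all-gathers it again before unpatchifying, while usp_attn_forward applies each rank's slice of the RoPE frequencies and runs xFuser's long-context attention. A hedged sketch of how they might be bound onto a loaded DiT follows; the attribute name self_attn and the enable_usp helper are assumptions, and torch.distributed plus xfuser's parallel groups must already be initialized before the patched forward is called.

import types
from distributed.xdit_context_parallel import usp_dit_forward, usp_attn_forward

def enable_usp(dit):
    # Assumed layout: the DiT exposes `blocks`, each with a `self_attn` module
    for block in dit.blocks:
        block.self_attn.forward = types.MethodType(usp_attn_forward, block.self_attn)
    dit.forward = types.MethodType(usp_dit_forward, dit)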
download_models.py ADDED
@@ -0,0 +1,21 @@
1
+ import argparse
2
+ from huggingface_hub import snapshot_download
3
+
4
+ def main(use_vace: bool):
5
+ if use_vace:
6
+ snapshot_download("Wan-AI/Wan2.1-VACE-14B", local_dir="checkpoints/VACE/")
7
+ else:
8
+ snapshot_download("Wan-AI/Wan2.1-T2V-14B", local_dir="checkpoints/base_model/")
9
+
10
+ snapshot_download(
11
+ "DIAMONIK7777/antelopev2",
12
+ local_dir="checkpoints/antelopev2/models/antelopev2"
13
+ )
14
+ snapshot_download("BowenXue/Stand-In", local_dir="checkpoints/Stand-In/")
15
+
16
+ if __name__ == "__main__":
17
+ parser = argparse.ArgumentParser(description="Download models with or without VACE.")
18
+ parser.add_argument("--vace", action="store_true", help="Use VACE model instead of T2V.")
19
+ args = parser.parse_args()
20
+
21
+ main(args.vace)
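
The script above only fetches checkpoints; the directory layout it produces is what the inference scripts below expect by default. A small, illustrative pre-flight check (not part of the repository) could look like this:

from pathlib import Path

expected = [
    "checkpoints/base_model",                    # or checkpoints/VACE when --vace was used
    "checkpoints/antelopev2/models/antelopev2",  # AntelopeV2 face models
    "checkpoints/Stand-In",                      # Stand-In conditioning weights
]
missing = [p for p in expected if not Path(p).is_dir()]
if missing:
    raise FileNotFoundError(f"Run download_models.py first; missing: {missing}")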
infer.py ADDED
@@ -0,0 +1,85 @@
1
+ import torch
2
+ from data.video import save_video
3
+ from wan_loader import load_wan_pipe
4
+ from models.set_condition_branch import set_stand_in
5
+ from preprocessor import FaceProcessor
6
+ import argparse
7
+
8
+ parser = argparse.ArgumentParser()
9
+
10
+ parser.add_argument(
11
+ "--ip_image",
12
+ type=str,
13
+ default="test/input/lecun.jpg",
14
+ help="Input face image path or URL",
15
+ )
16
+ parser.add_argument(
17
+ "--prompt",
18
+ type=str,
19
+ default="一位男性舒适地坐在书桌前,正对着镜头,仿佛在与屏幕前的亲友对话。他的眼神专注而温柔,嘴角带着自然的笑意。背景是他精心布置的个人空间,墙上贴着照片和一张世界地图,传达出一种亲密而现代的沟通感。",
20
+ help="Text prompt for video generation",
21
+ )
22
+ parser.add_argument(
23
+ "--output", type=str, default="test/output/lecun.mp4", help="Output video file path"
24
+ )
25
+ parser.add_argument(
26
+ "--seed", type=int, default=0, help="Random seed for reproducibility"
27
+ )
28
+ parser.add_argument(
29
+ "--num_inference_steps", type=int, default=20, help="Number of inference steps"
30
+ )
31
+
32
+ parser.add_argument(
33
+ "--negative_prompt",
34
+ type=str,
35
+ default="色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走",
36
+ help="Negative prompt to avoid unwanted features",
37
+ )
38
+ parser.add_argument("--tiled", action="store_true", help="Enable tiled mode")
39
+ parser.add_argument(
40
+ "--fps", type=int, default=25, help="Frames per second for output video"
41
+ )
42
+ parser.add_argument(
43
+ "--quality", type=int, default=9, help="Output video quality (1-9)"
44
+ )
45
+ parser.add_argument(
46
+ "--base_path",
47
+ type=str,
48
+ default="checkpoints/base_model/",
49
+ help="Path to base model checkpoint",
50
+ )
51
+ parser.add_argument(
52
+ "--stand_in_path",
53
+ type=str,
54
+ default="checkpoints/Stand-In/Stand-In_wan2.1_T2V_14B_ver1.0.ckpt",
55
+ help="Path to LoRA weights checkpoint",
56
+ )
57
+ parser.add_argument(
58
+ "--antelopv2_path",
59
+ type=str,
60
+ default="checkpoints/antelopev2",
61
+ help="Path to AntelopeV2 model checkpoint",
62
+ )
63
+
64
+ args = parser.parse_args()
65
+
66
+
67
+ face_processor = FaceProcessor(antelopv2_path=args.antelopv2_path)
68
+ ip_image = face_processor.process(args.ip_image)
69
+
70
+ pipe = load_wan_pipe(base_path=args.base_path, torch_dtype=torch.bfloat16)
71
+
72
+ set_stand_in(
73
+ pipe,
74
+ model_path=args.stand_in_path,
75
+ )
76
+
77
+ video = pipe(
78
+ prompt=args.prompt,
79
+ negative_prompt=args.negative_prompt,
80
+ seed=args.seed,
81
+ ip_image=ip_image,
82
+ num_inference_steps=args.num_inference_steps,
83
+ tiled=args.tiled,
84
+ )
85
+ save_video(video, args.output, fps=args.fps, quality=args.quality)
infer_face_swap.py ADDED
@@ -0,0 +1,119 @@
1
+ import torch
2
+ from data.video import save_video
3
+ from wan_loader import load_wan_pipe
4
+ from models.set_condition_branch import set_stand_in
5
+ from preprocessor import FaceProcessor, VideoMaskGenerator
6
+ import argparse
7
+
8
+ parser = argparse.ArgumentParser()
9
+
10
+ parser.add_argument(
11
+ "--ip_image",
12
+ type=str,
13
+ default="test/input/ruonan.jpg",
14
+ help="Input face image path or URL",
15
+ )
16
+ parser.add_argument(
17
+ "--input_video",
18
+ type=str,
19
+ default="test/input/woman.mp4",
20
+ help="Input video path",
21
+ )
22
+ parser.add_argument(
23
+ "--denoising_strength",
24
+ type=float,
25
+ default=0.85,
26
+ help="The lower denoising strength represents a higher similarity to the original video.",
27
+ )
28
+ parser.add_argument(
29
+ "--prompt",
30
+ type=str,
31
+ default="The video features a woman standing in front of a large screen displaying the words "
32
+ "Tech Minute"
33
+ " and the logo for CNET. She is wearing a purple top and appears to be presenting or speaking about technology-related topics. The background includes a cityscape with tall buildings, suggesting an urban setting. The woman seems to be engaged in a discussion or providing information on technology news or trends. The overall atmosphere is professional and informative, likely aimed at educating viewers about the latest developments in the tech industry.",
34
+ help="Text prompt for video generation",
35
+ )
36
+ parser.add_argument(
37
+ "--output",
38
+ type=str,
39
+ default="test/output/ruonan.mp4",
40
+ help="Output video file path",
41
+ )
42
+ parser.add_argument(
43
+ "--seed", type=int, default=0, help="Random seed for reproducibility"
44
+ )
45
+ parser.add_argument(
46
+ "--num_inference_steps", type=int, default=20, help="Number of inference steps"
47
+ )
48
+ parser.add_argument(
49
+ "--force_background_consistency",
50
+ action="store_true",
51
+ default=False,
52
+ help="Force background consistency across generated frames (off by default).",
53
+ )
54
+
55
+ parser.add_argument(
56
+ "--negative_prompt",
57
+ type=str,
58
+ default="色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走",
59
+ help="Negative prompt to avoid unwanted features",
60
+ )
61
+ parser.add_argument("--tiled", action="store_true", help="Enable tiled mode")
62
+ parser.add_argument(
63
+ "--fps", type=int, default=25, help="Frames per second for output video"
64
+ )
65
+ parser.add_argument(
66
+ "--quality", type=int, default=9, help="Output video quality (1-9)"
67
+ )
68
+ parser.add_argument(
69
+ "--base_path",
70
+ type=str,
71
+ default="checkpoints/base_model/",
72
+ help="Path to base model checkpoint",
73
+ )
74
+ parser.add_argument(
75
+ "--stand_in_path",
76
+ type=str,
77
+ default="checkpoints/Stand-In/Stand-In_wan2.1_T2V_14B_ver1.0.ckpt",
78
+ help="Path to LoRA weights checkpoint",
79
+ )
80
+ parser.add_argument(
81
+ "--antelopv2_path",
82
+ type=str,
83
+ default="checkpoints/antelopev2",
84
+ help="Path to AntelopeV2 model checkpoint",
85
+ )
86
+
87
+ args = parser.parse_args()
88
+
89
+ face_processor = FaceProcessor(antelopv2_path=args.antelopv2_path)
90
+ videomask_generator = VideoMaskGenerator(antelopv2_path=args.antelopv2_path)
91
+
92
+ ip_image, ip_image_rgba = face_processor.process(args.ip_image, extra_input=True)
93
+ input_video, face_mask, width, height, num_frames = videomask_generator.process(args.input_video, ip_image_rgba, random_horizontal_flip_chance=0.05, dilation_kernel_size=10)
94
+
95
+ pipe = load_wan_pipe(
96
+ base_path=args.base_path, face_swap=True, torch_dtype=torch.bfloat16
97
+ )
98
+
99
+ set_stand_in(
100
+ pipe,
101
+ model_path=args.stand_in_path,
102
+ )
103
+
104
+ video = pipe(
105
+ prompt=args.prompt,
106
+ negative_prompt=args.negative_prompt,
107
+ seed=args.seed,
108
+ width=width,
109
+ height=height,
110
+ num_frames=num_frames,
111
+ denoising_strength=args.denoising_strength,
112
+ ip_image=ip_image,
113
+ face_mask=face_mask,
114
+ input_video=input_video,
115
+ num_inference_steps=args.num_inference_steps,
116
+ tiled=args.tiled,
117
+ force_background_consistency=args.force_background_consistency
118
+ )
119
+ save_video(video, args.output, fps=args.fps, quality=args.quality)
infer_with_lora.py ADDED
@@ -0,0 +1,94 @@
1
+ import torch
2
+ from data.video import save_video
3
+ from wan_loader import load_wan_pipe
4
+ from models.set_condition_branch import set_stand_in
5
+ from preprocessor import FaceProcessor
6
+ import argparse
7
+
8
+ parser = argparse.ArgumentParser()
9
+
10
+ parser.add_argument(
11
+ "--ip_image",
12
+ type=str,
13
+ default="test/input/lecun.jpg",
14
+ help="Input face image path or URL",
15
+ )
16
+ parser.add_argument(
17
+ "--lora_path", type=str, required=True, help="Text prompt for video generation"
18
+ )
19
+ parser.add_argument(
20
+ "--prompt",
21
+ type=str,
22
+ default="Close-up of a young man with dark hair tied back, wearing a white kimono adorned with a red floral pattern. He sits against a backdrop of sliding doors with blue accents. His expression shifts from neutral to a slight smile, then to a surprised look. The camera remains static, focusing on his face and upper body as he appears to be reacting to something off-screen. The lighting is soft and natural, suggesting daytime.",
23
+ help="Text prompt for video generation",
24
+ )
25
+ parser.add_argument(
26
+ "--output", type=str, default="test/output/lecun.mp4", help="Output video file path"
27
+ )
28
+ parser.add_argument(
29
+ "--seed", type=int, default=0, help="Random seed for reproducibility"
30
+ )
31
+ parser.add_argument(
32
+ "--num_inference_steps", type=int, default=20, help="Number of inference steps"
33
+ )
34
+ parser.add_argument("--lora_scale", type=float, default=1.0, help="Lora Scale")
35
+
36
+ parser.add_argument(
37
+ "--negative_prompt",
38
+ type=str,
39
+ default="色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走",
40
+ help="Negative prompt to avoid unwanted features",
41
+ )
42
+ parser.add_argument("--tiled", action="store_true", help="Enable tiled mode")
43
+ parser.add_argument(
44
+ "--fps", type=int, default=25, help="Frames per second for output video"
45
+ )
46
+ parser.add_argument(
47
+ "--quality", type=int, default=9, help="Output video quality (1-9)"
48
+ )
49
+ parser.add_argument(
50
+ "--base_path",
51
+ type=str,
52
+ default="checkpoints/base_model/",
53
+ help="Path to base model checkpoint",
54
+ )
55
+ parser.add_argument(
56
+ "--stand_in_path",
57
+ type=str,
58
+ default="checkpoints/Stand-In/Stand-In_wan2.1_T2V_14B_ver1.0.ckpt",
59
+ help="Path to LoRA weights checkpoint",
60
+ )
61
+ parser.add_argument(
62
+ "--antelopv2_path",
63
+ type=str,
64
+ default="checkpoints/antelopev2",
65
+ help="Path to AntelopeV2 model checkpoint",
66
+ )
67
+
68
+ args = parser.parse_args()
69
+
70
+ face_processor = FaceProcessor(antelopv2_path=args.antelopv2_path)
71
+ ip_image = face_processor.process(args.ip_image)
72
+
73
+ pipe = load_wan_pipe(base_path=args.base_path, torch_dtype=torch.bfloat16)
74
+
75
+ pipe.load_lora(
76
+ pipe.dit,
77
+ args.lora_path,
78
+ alpha=args.lora_scale,
79
+ )
80
+
81
+ set_stand_in(
82
+ pipe,
83
+ model_path=args.stand_in_path,
84
+ )
85
+
86
+ video = pipe(
87
+ prompt=args.prompt,
88
+ negative_prompt=args.negative_prompt,
89
+ seed=args.seed,
90
+ ip_image=ip_image,
91
+ num_inference_steps=args.num_inference_steps,
92
+ tiled=args.tiled,
93
+ )
94
+ save_video(video, args.output, fps=args.fps, quality=args.quality)
infer_with_vace.py ADDED
@@ -0,0 +1,106 @@
1
+ import torch
2
+ from data.video import save_video
3
+ from wan_loader import load_wan_pipe
4
+ from models.set_condition_branch import set_stand_in
5
+ from preprocessor import FaceProcessor
6
+ import argparse
7
+
8
+ parser = argparse.ArgumentParser()
9
+
10
+ parser.add_argument(
11
+ "--ip_image",
12
+ type=str,
13
+ default="test/input/first_frame.png",
14
+ help="Input face image path or URL",
15
+ )
16
+ parser.add_argument(
17
+ "--reference_video",
18
+ type=str,
19
+ default="test/input/pose.mp4",
20
+ help="reference_video path",
21
+ )
22
+ parser.add_argument(
23
+ "--reference_image",
24
+ default="test/input/first_frame.png",
25
+ type=str,
26
+ help="reference_video path",
27
+ )
28
+ parser.add_argument(
29
+ "--vace_scale",
30
+ type=float,
31
+ default=0.8,
32
+ help="Scaling factor for VACE.",
33
+ )
34
+ parser.add_argument(
35
+ "--prompt",
36
+ type=str,
37
+ default="一个女人举起双手",
38
+ help="Text prompt for video generation",
39
+ )
40
+ parser.add_argument(
41
+ "--output", type=str, default="test/output/woman.mp4", help="Output video file path"
42
+ )
43
+ parser.add_argument(
44
+ "--seed", type=int, default=0, help="Random seed for reproducibility"
45
+ )
46
+ parser.add_argument(
47
+ "--num_inference_steps", type=int, default=20, help="Number of inference steps"
48
+ )
49
+ parser.add_argument(
50
+ "--vace_path",
51
+ type=str,
52
+ default="checkpoints/VACE/",
53
+ help="Path to the VACE model checkpoint",
54
+ )
55
+
56
+ parser.add_argument(
57
+ "--negative_prompt",
58
+ type=str,
59
+ default="色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走",
60
+ help="Negative prompt to avoid unwanted features",
61
+ )
62
+ parser.add_argument("--tiled", action="store_true", help="Enable tiled mode")
63
+ parser.add_argument(
64
+ "--fps", type=int, default=25, help="Frames per second for output video"
65
+ )
66
+ parser.add_argument(
67
+ "--quality", type=int, default=9, help="Output video quality (1-9)"
68
+ )
69
+ parser.add_argument(
70
+ "--stand_in_path",
71
+ type=str,
72
+ default="checkpoints/Stand-In/Stand-In_wan2.1_T2V_14B_ver1.0.ckpt",
73
+ help="Path to the Stand-In weights checkpoint",
74
+ )
75
+ parser.add_argument(
76
+ "--antelopv2_path",
77
+ type=str,
78
+ default="checkpoints/antelopev2",
79
+ help="Path to AntelopeV2 model checkpoint",
80
+ )
81
+
82
+ args = parser.parse_args()
83
+
84
+
85
+ face_processor = FaceProcessor(antelopv2_path=args.antelopv2_path)
86
+ ip_image = face_processor.process(args.ip_image)
87
+
88
+ pipe = load_wan_pipe(base_path=args.vace_path, use_vace=True, torch_dtype=torch.bfloat16)
89
+
90
+ set_stand_in(
91
+ pipe,
92
+ model_path=args.stand_in_path,
93
+ )
94
+
95
+ video = pipe(
96
+ prompt=args.prompt,
97
+ vace_video=args.reference_video,
98
+ vace_reference_image=args.reference_image,
99
+ negative_prompt=args.negative_prompt,
100
+ vace_scale=args.vace_scale,
101
+ seed=args.seed,
102
+ ip_image=ip_image,
103
+ num_inference_steps=args.num_inference_steps,
104
+ tiled=args.tiled,
105
+ )
106
+ save_video(video, args.output, fps=args.fps, quality=args.quality)
lora/__init__.py ADDED
@@ -0,0 +1,91 @@
1
+ import torch
2
+
3
+
4
+ class GeneralLoRALoader:
5
+ def __init__(self, device="cpu", torch_dtype=torch.float32):
6
+ self.device = device
7
+ self.torch_dtype = torch_dtype
8
+
9
+ def get_name_dict(self, lora_state_dict):
10
+ lora_name_dict = {}
11
+
12
+ has_lora_A = any(k.endswith(".lora_A.weight") for k in lora_state_dict)
13
+ has_lora_down = any(k.endswith(".lora_down.weight") for k in lora_state_dict)
14
+
15
+ if has_lora_A:
16
+ lora_a_keys = [k for k in lora_state_dict if k.endswith(".lora_A.weight")]
17
+ for lora_a_key in lora_a_keys:
18
+ base_name = lora_a_key.replace(".lora_A.weight", "")
19
+ lora_b_key = base_name + ".lora_B.weight"
20
+
21
+ if lora_b_key in lora_state_dict:
22
+ target_name = base_name.replace("diffusion_model.", "", 1)
23
+ lora_name_dict[target_name] = (lora_b_key, lora_a_key)
24
+
25
+ elif has_lora_down:
26
+ lora_down_keys = [
27
+ k for k in lora_state_dict if k.endswith(".lora_down.weight")
28
+ ]
29
+ for lora_down_key in lora_down_keys:
30
+ base_name = lora_down_key.replace(".lora_down.weight", "")
31
+ lora_up_key = base_name + ".lora_up.weight"
32
+
33
+ if lora_up_key in lora_state_dict:
34
+ target_name = base_name.replace("lora_unet_", "").replace("_", ".")
35
+ target_name = target_name.replace(".attn.", "_attn.")
36
+ lora_name_dict[target_name] = (lora_up_key, lora_down_key)
37
+
38
+ else:
39
+ print(
40
+ "Warning: No recognizable LoRA key names found in state_dict (neither 'lora_A' nor 'lora_down')."
41
+ )
42
+
43
+ return lora_name_dict
44
+
45
+ def load(self, model: torch.nn.Module, state_dict_lora, alpha=1.0):
46
+ lora_name_dict = self.get_name_dict(state_dict_lora)
47
+ updated_num = 0
48
+
49
+ lora_target_names = set(lora_name_dict.keys())
50
+ model_layer_names = {
51
+ name for name, module in model.named_modules() if hasattr(module, "weight")
52
+ }
53
+ matched_names = lora_target_names.intersection(model_layer_names)
54
+ unmatched_lora_names = lora_target_names - model_layer_names
55
+
56
+ print(f"Successfully matched {len(matched_names)} layers.")
57
+ if unmatched_lora_names:
58
+ print(
59
+ f"Warning: {len(unmatched_lora_names)} LoRA layers not matched and will be ignored."
60
+ )
61
+
62
+ for name, module in model.named_modules():
63
+ if name in matched_names:
64
+ lora_b_key, lora_a_key = lora_name_dict[name]
65
+ weight_up = state_dict_lora[lora_b_key].to(
66
+ device=self.device, dtype=self.torch_dtype
67
+ )
68
+ weight_down = state_dict_lora[lora_a_key].to(
69
+ device=self.device, dtype=self.torch_dtype
70
+ )
71
+
72
+ if len(weight_up.shape) == 4:
73
+ weight_up = weight_up.squeeze(3).squeeze(2)
74
+ weight_down = weight_down.squeeze(3).squeeze(2)
75
+ weight_lora = alpha * torch.mm(weight_up, weight_down).unsqueeze(
76
+ 2
77
+ ).unsqueeze(3)
78
+ else:
79
+ weight_lora = alpha * torch.mm(weight_up, weight_down)
80
+
81
+ if module.weight.shape != weight_lora.shape:
82
+ print(f"Error: Shape mismatch for layer '{name}'! Skipping update.")
83
+ continue
84
+
85
+ module.weight.data = (
86
+ module.weight.data.to(weight_lora.device, dtype=weight_lora.dtype)
87
+ + weight_lora
88
+ )
89
+ updated_num += 1
90
+
91
+ print(f"LoRA loading complete, updated {updated_num} tensors in total.\n")
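For reference, a minimal usage sketch of the GeneralLoRALoader defined above. The toy module and the fake checkpoint keys are hypothetical; they only illustrate the `lora_A`/`lora_B` naming that get_name_dict expects once the "diffusion_model." prefix is stripped.

import torch
from lora import GeneralLoRALoader

# Toy model whose submodule name ("proj") matches the LoRA target name.
class ToyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.proj = torch.nn.Linear(8, 8, bias=False)

model = ToyModel()

# Fake rank-2 LoRA state dict in the "lora_A"/"lora_B" convention (hypothetical keys).
lora_state_dict = {
    "diffusion_model.proj.lora_A.weight": torch.randn(2, 8),
    "diffusion_model.proj.lora_B.weight": torch.randn(8, 2),
}

loader = GeneralLoRALoader(device="cpu", torch_dtype=torch.float32)
loader.load(model, lora_state_dict, alpha=1.0)  # merges alpha * (B @ A) into model.proj.weight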
models/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .model_manager import *
models/attention.py ADDED
@@ -0,0 +1,130 @@
1
+ import torch
2
+ from einops import rearrange
3
+
4
+
5
+ def low_version_attention(query, key, value, attn_bias=None):
6
+ scale = 1 / query.shape[-1] ** 0.5
7
+ query = query * scale
8
+ attn = torch.matmul(query, key.transpose(-2, -1))
9
+ if attn_bias is not None:
10
+ attn = attn + attn_bias
11
+ attn = attn.softmax(-1)
12
+ return attn @ value
13
+
14
+
15
+ class Attention(torch.nn.Module):
16
+ def __init__(
17
+ self,
18
+ q_dim,
19
+ num_heads,
20
+ head_dim,
21
+ kv_dim=None,
22
+ bias_q=False,
23
+ bias_kv=False,
24
+ bias_out=False,
25
+ ):
26
+ super().__init__()
27
+ dim_inner = head_dim * num_heads
28
+ kv_dim = kv_dim if kv_dim is not None else q_dim
29
+ self.num_heads = num_heads
30
+ self.head_dim = head_dim
31
+
32
+ self.to_q = torch.nn.Linear(q_dim, dim_inner, bias=bias_q)
33
+ self.to_k = torch.nn.Linear(kv_dim, dim_inner, bias=bias_kv)
34
+ self.to_v = torch.nn.Linear(kv_dim, dim_inner, bias=bias_kv)
35
+ self.to_out = torch.nn.Linear(dim_inner, q_dim, bias=bias_out)
36
+
37
+ def interact_with_ipadapter(self, hidden_states, q, ip_k, ip_v, scale=1.0):
38
+ batch_size = q.shape[0]
39
+ ip_k = ip_k.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
40
+ ip_v = ip_v.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
41
+ ip_hidden_states = torch.nn.functional.scaled_dot_product_attention(
42
+ q, ip_k, ip_v
43
+ )
44
+ hidden_states = hidden_states + scale * ip_hidden_states
45
+ return hidden_states
46
+
47
+ def torch_forward(
48
+ self,
49
+ hidden_states,
50
+ encoder_hidden_states=None,
51
+ attn_mask=None,
52
+ ipadapter_kwargs=None,
53
+ qkv_preprocessor=None,
54
+ ):
55
+ if encoder_hidden_states is None:
56
+ encoder_hidden_states = hidden_states
57
+
58
+ batch_size = encoder_hidden_states.shape[0]
59
+
60
+ q = self.to_q(hidden_states)
61
+ k = self.to_k(encoder_hidden_states)
62
+ v = self.to_v(encoder_hidden_states)
63
+
64
+ q = q.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
65
+ k = k.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
66
+ v = v.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
67
+
68
+ if qkv_preprocessor is not None:
69
+ q, k, v = qkv_preprocessor(q, k, v)
70
+
71
+ hidden_states = torch.nn.functional.scaled_dot_product_attention(
72
+ q, k, v, attn_mask=attn_mask
73
+ )
74
+ if ipadapter_kwargs is not None:
75
+ hidden_states = self.interact_with_ipadapter(
76
+ hidden_states, q, **ipadapter_kwargs
77
+ )
78
+ hidden_states = hidden_states.transpose(1, 2).reshape(
79
+ batch_size, -1, self.num_heads * self.head_dim
80
+ )
81
+ hidden_states = hidden_states.to(q.dtype)
82
+
83
+ hidden_states = self.to_out(hidden_states)
84
+
85
+ return hidden_states
86
+
87
+ def xformers_forward(
88
+ self, hidden_states, encoder_hidden_states=None, attn_mask=None
89
+ ):
90
+ if encoder_hidden_states is None:
91
+ encoder_hidden_states = hidden_states
92
+
93
+ q = self.to_q(hidden_states)
94
+ k = self.to_k(encoder_hidden_states)
95
+ v = self.to_v(encoder_hidden_states)
96
+
97
+ q = rearrange(q, "b f (n d) -> (b n) f d", n=self.num_heads)
98
+ k = rearrange(k, "b f (n d) -> (b n) f d", n=self.num_heads)
99
+ v = rearrange(v, "b f (n d) -> (b n) f d", n=self.num_heads)
100
+
101
+ if attn_mask is not None:
102
+ hidden_states = low_version_attention(q, k, v, attn_bias=attn_mask)
103
+ else:
104
+ import xformers.ops as xops
105
+
106
+ hidden_states = xops.memory_efficient_attention(q, k, v)
107
+ hidden_states = rearrange(
108
+ hidden_states, "(b n) f d -> b f (n d)", n=self.num_heads
109
+ )
110
+
111
+ hidden_states = hidden_states.to(q.dtype)
112
+ hidden_states = self.to_out(hidden_states)
113
+
114
+ return hidden_states
115
+
116
+ def forward(
117
+ self,
118
+ hidden_states,
119
+ encoder_hidden_states=None,
120
+ attn_mask=None,
121
+ ipadapter_kwargs=None,
122
+ qkv_preprocessor=None,
123
+ ):
124
+ return self.torch_forward(
125
+ hidden_states,
126
+ encoder_hidden_states=encoder_hidden_states,
127
+ attn_mask=attn_mask,
128
+ ipadapter_kwargs=ipadapter_kwargs,
129
+ qkv_preprocessor=qkv_preprocessor,
130
+ )
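A quick sanity-check sketch for the Attention module above, run with random tensors; the shapes are arbitrary, and when encoder_hidden_states is omitted the layer falls back to self-attention.

import torch
from models.attention import Attention

attn = Attention(q_dim=128, num_heads=4, head_dim=32, bias_q=True, bias_kv=True, bias_out=True)

x = torch.randn(2, 16, 128)    # (batch, query tokens, q_dim)
ctx = torch.randn(2, 77, 128)  # (batch, context tokens, kv_dim defaults to q_dim)

y_self = attn(x)                              # self-attention  -> (2, 16, 128)
y_cross = attn(x, encoder_hidden_states=ctx)  # cross-attention -> (2, 16, 128)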
models/downloader.py ADDED
@@ -0,0 +1,122 @@
1
+ from huggingface_hub import hf_hub_download
+ # snapshot_download is called below with ModelScope's `allow_file_pattern` argument,
+ # so import it from modelscope (huggingface_hub's version expects `allow_patterns`).
+ from modelscope import snapshot_download
2
+ import os, shutil
3
+ from typing_extensions import Literal, TypeAlias
4
+ from typing import List
5
+ from configs.model_config import (
6
+ preset_models_on_huggingface,
7
+ preset_models_on_modelscope,
8
+ Preset_model_id,
9
+ )
10
+
11
+
12
+ def download_from_modelscope(model_id, origin_file_path, local_dir):
13
+ os.makedirs(local_dir, exist_ok=True)
14
+ file_name = os.path.basename(origin_file_path)
15
+ if file_name in os.listdir(local_dir):
16
+ print(f" {file_name} is already in {local_dir}.")
17
+ else:
18
+ print(f" Start downloading {os.path.join(local_dir, file_name)}")
19
+ snapshot_download(
20
+ model_id, allow_file_pattern=origin_file_path, local_dir=local_dir
21
+ )
22
+ downloaded_file_path = os.path.join(local_dir, origin_file_path)
23
+ target_file_path = os.path.join(local_dir, os.path.split(origin_file_path)[-1])
24
+ if downloaded_file_path != target_file_path:
25
+ shutil.move(downloaded_file_path, target_file_path)
26
+ shutil.rmtree(os.path.join(local_dir, origin_file_path.split("/")[0]))
27
+
28
+
29
+ def download_from_huggingface(model_id, origin_file_path, local_dir):
30
+ os.makedirs(local_dir, exist_ok=True)
31
+ file_name = os.path.basename(origin_file_path)
32
+ if file_name in os.listdir(local_dir):
33
+ print(f" {file_name} is already in {local_dir}.")
34
+ else:
35
+ print(f" Start downloading {os.path.join(local_dir, file_name)}")
36
+ hf_hub_download(model_id, origin_file_path, local_dir=local_dir)
37
+ downloaded_file_path = os.path.join(local_dir, origin_file_path)
38
+ target_file_path = os.path.join(local_dir, file_name)
39
+ if downloaded_file_path != target_file_path:
40
+ shutil.move(downloaded_file_path, target_file_path)
41
+ shutil.rmtree(os.path.join(local_dir, origin_file_path.split("/")[0]))
42
+
43
+
44
+ Preset_model_website: TypeAlias = Literal[
45
+ "HuggingFace",
46
+ "ModelScope",
47
+ ]
48
+ website_to_preset_models = {
49
+ "HuggingFace": preset_models_on_huggingface,
50
+ "ModelScope": preset_models_on_modelscope,
51
+ }
52
+ website_to_download_fn = {
53
+ "HuggingFace": download_from_huggingface,
54
+ "ModelScope": download_from_modelscope,
55
+ }
56
+
57
+
58
+ def download_customized_models(
59
+ model_id,
60
+ origin_file_path,
61
+ local_dir,
62
+ downloading_priority: List[Preset_model_website] = ["ModelScope", "HuggingFace"],
63
+ ):
64
+ downloaded_files = []
65
+ for website in downloading_priority:
66
+ # Check if the file is downloaded.
67
+ file_to_download = os.path.join(local_dir, os.path.basename(origin_file_path))
68
+ if file_to_download in downloaded_files:
69
+ continue
70
+ # Download
71
+ website_to_download_fn[website](model_id, origin_file_path, local_dir)
72
+ if os.path.basename(origin_file_path) in os.listdir(local_dir):
73
+ downloaded_files.append(file_to_download)
74
+ return downloaded_files
75
+
76
+
77
+ def download_models(
78
+ model_id_list: List[Preset_model_id] = [],
79
+ downloading_priority: List[Preset_model_website] = ["ModelScope", "HuggingFace"],
80
+ ):
81
+ print(f"Downloading models: {model_id_list}")
82
+ downloaded_files = []
83
+ load_files = []
84
+
85
+ for model_id in model_id_list:
86
+ for website in downloading_priority:
87
+ if model_id in website_to_preset_models[website]:
88
+ # Parse model metadata
89
+ model_metadata = website_to_preset_models[website][model_id]
90
+ if isinstance(model_metadata, list):
91
+ file_data = model_metadata
92
+ else:
93
+ file_data = model_metadata.get("file_list", [])
94
+
95
+ # Try downloading the model from this website.
96
+ model_files = []
97
+ for model_id, origin_file_path, local_dir in file_data:
98
+ # Check if the file is downloaded.
99
+ file_to_download = os.path.join(
100
+ local_dir, os.path.basename(origin_file_path)
101
+ )
102
+ if file_to_download in downloaded_files:
103
+ continue
104
+ # Download
105
+ website_to_download_fn[website](
106
+ model_id, origin_file_path, local_dir
107
+ )
108
+ if os.path.basename(origin_file_path) in os.listdir(local_dir):
109
+ downloaded_files.append(file_to_download)
110
+ model_files.append(file_to_download)
111
+
112
+ # If the model is successfully downloaded, break.
113
+ if len(model_files) > 0:
114
+ if (
115
+ isinstance(model_metadata, dict)
116
+ and "load_path" in model_metadata
117
+ ):
118
+ model_files = model_metadata["load_path"]
119
+ load_files.extend(model_files)
120
+ break
121
+
122
+ return load_files
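A hedged example of the customized-download helper above. The repository id, file path and local directory are placeholders; the call needs network access plus the modelscope / huggingface_hub packages.

from models.downloader import download_customized_models

files = download_customized_models(
    model_id="some-org/some-model",                # placeholder repo id
    origin_file_path="weights/model.safetensors",  # placeholder file inside the repo
    local_dir="checkpoints/some-model",
    downloading_priority=["ModelScope", "HuggingFace"],
)
print(files)  # local paths of the files that are now present in local_dir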
models/model_manager.py ADDED
@@ -0,0 +1,610 @@
1
+ import os, torch, json, importlib
2
+ from typing import List
3
+
4
+ from .downloader import (
5
+ download_models,
6
+ download_customized_models,
7
+ Preset_model_id,
8
+ Preset_model_website,
9
+ )
10
+
11
+ from configs.model_config import (
12
+ model_loader_configs,
13
+ huggingface_model_loader_configs,
14
+ patch_model_loader_configs,
15
+ )
16
+ from .utils import (
17
+ load_state_dict,
18
+ init_weights_on_device,
19
+ hash_state_dict_keys,
20
+ split_state_dict_with_prefix,
21
+ )
22
+
23
+
24
+ def load_model_from_single_file(
25
+ state_dict, model_names, model_classes, model_resource, torch_dtype, device
26
+ ):
27
+ loaded_model_names, loaded_models = [], []
28
+ for model_name, model_class in zip(model_names, model_classes):
29
+ print(f" model_name: {model_name} model_class: {model_class.__name__}")
30
+ state_dict_converter = model_class.state_dict_converter()
31
+ if model_resource == "civitai":
32
+ state_dict_results = state_dict_converter.from_civitai(state_dict)
33
+ elif model_resource == "diffusers":
34
+ state_dict_results = state_dict_converter.from_diffusers(state_dict)
35
+ if isinstance(state_dict_results, tuple):
36
+ model_state_dict, extra_kwargs = state_dict_results
37
+ print(
38
+ f" This model is initialized with extra kwargs: {extra_kwargs}"
39
+ )
40
+ else:
41
+ model_state_dict, extra_kwargs = state_dict_results, {}
42
+ torch_dtype = (
43
+ torch.float32
44
+ if extra_kwargs.get("upcast_to_float32", False)
45
+ else torch_dtype
46
+ )
47
+ with init_weights_on_device():
48
+ model = model_class(**extra_kwargs)
49
+ if hasattr(model, "eval"):
50
+ model = model.eval()
51
+ model.load_state_dict(model_state_dict, assign=True)
52
+ model = model.to(dtype=torch_dtype, device=device)
53
+ loaded_model_names.append(model_name)
54
+ loaded_models.append(model)
55
+ return loaded_model_names, loaded_models
56
+
57
+
58
+ def load_model_from_huggingface_folder(
59
+ file_path, model_names, model_classes, torch_dtype, device
60
+ ):
61
+ loaded_model_names, loaded_models = [], []
62
+ for model_name, model_class in zip(model_names, model_classes):
63
+ if torch_dtype in [torch.float32, torch.float16, torch.bfloat16]:
64
+ model = model_class.from_pretrained(
65
+ file_path, torch_dtype=torch_dtype
66
+ ).eval()
67
+ else:
68
+ model = model_class.from_pretrained(file_path).eval().to(dtype=torch_dtype)
69
+ if torch_dtype == torch.float16 and hasattr(model, "half"):
70
+ model = model.half()
71
+ try:
72
+ model = model.to(device=device)
73
+ except:
74
+ pass
75
+ loaded_model_names.append(model_name)
76
+ loaded_models.append(model)
77
+ return loaded_model_names, loaded_models
78
+
79
+
80
+ def load_single_patch_model_from_single_file(
81
+ state_dict, model_name, model_class, base_model, extra_kwargs, torch_dtype, device
82
+ ):
83
+ print(
84
+ f" model_name: {model_name} model_class: {model_class.__name__} extra_kwargs: {extra_kwargs}"
85
+ )
86
+ base_state_dict = base_model.state_dict()
87
+ base_model.to("cpu")
88
+ del base_model
89
+ model = model_class(**extra_kwargs)
90
+ model.load_state_dict(base_state_dict, strict=False)
91
+ model.load_state_dict(state_dict, strict=False)
92
+ model.to(dtype=torch_dtype, device=device)
93
+ return model
94
+
95
+
96
+ def load_patch_model_from_single_file(
97
+ state_dict,
98
+ model_names,
99
+ model_classes,
100
+ extra_kwargs,
101
+ model_manager,
102
+ torch_dtype,
103
+ device,
104
+ ):
105
+ loaded_model_names, loaded_models = [], []
106
+ for model_name, model_class in zip(model_names, model_classes):
107
+ while True:
108
+ for model_id in range(len(model_manager.model)):
109
+ base_model_name = model_manager.model_name[model_id]
110
+ if base_model_name == model_name:
111
+ base_model_path = model_manager.model_path[model_id]
112
+ base_model = model_manager.model[model_id]
113
+ print(
114
+ f" Adding patch model to {base_model_name} ({base_model_path})"
115
+ )
116
+ patched_model = load_single_patch_model_from_single_file(
117
+ state_dict,
118
+ model_name,
119
+ model_class,
120
+ base_model,
121
+ extra_kwargs,
122
+ torch_dtype,
123
+ device,
124
+ )
125
+ loaded_model_names.append(base_model_name)
126
+ loaded_models.append(patched_model)
127
+ model_manager.model.pop(model_id)
128
+ model_manager.model_path.pop(model_id)
129
+ model_manager.model_name.pop(model_id)
130
+ break
131
+ else:
132
+ break
133
+ return loaded_model_names, loaded_models
134
+
135
+
136
+ class ModelDetectorTemplate:
137
+ def __init__(self):
138
+ pass
139
+
140
+ def match(self, file_path="", state_dict={}):
141
+ return False
142
+
143
+ def load(
144
+ self,
145
+ file_path="",
146
+ state_dict={},
147
+ device="cuda",
148
+ torch_dtype=torch.float16,
149
+ **kwargs,
150
+ ):
151
+ return [], []
152
+
153
+
154
+ class ModelDetectorFromSingleFile:
155
+ def __init__(self, model_loader_configs=[]):
156
+ self.keys_hash_with_shape_dict = {}
157
+ self.keys_hash_dict = {}
158
+ for metadata in model_loader_configs:
159
+ self.add_model_metadata(*metadata)
160
+
161
+ def add_model_metadata(
162
+ self,
163
+ keys_hash,
164
+ keys_hash_with_shape,
165
+ model_names,
166
+ model_classes,
167
+ model_resource,
168
+ ):
169
+ self.keys_hash_with_shape_dict[keys_hash_with_shape] = (
170
+ model_names,
171
+ model_classes,
172
+ model_resource,
173
+ )
174
+ if keys_hash is not None:
175
+ self.keys_hash_dict[keys_hash] = (
176
+ model_names,
177
+ model_classes,
178
+ model_resource,
179
+ )
180
+
181
+ def match(self, file_path="", state_dict={}):
182
+ if isinstance(file_path, str) and os.path.isdir(file_path):
183
+ return False
184
+ if len(state_dict) == 0:
185
+ state_dict = load_state_dict(file_path)
186
+ keys_hash_with_shape = hash_state_dict_keys(state_dict, with_shape=True)
187
+ if keys_hash_with_shape in self.keys_hash_with_shape_dict:
188
+ return True
189
+ keys_hash = hash_state_dict_keys(state_dict, with_shape=False)
190
+ if keys_hash in self.keys_hash_dict:
191
+ return True
192
+ return False
193
+
194
+ def load(
195
+ self,
196
+ file_path="",
197
+ state_dict={},
198
+ device="cuda",
199
+ torch_dtype=torch.float16,
200
+ **kwargs,
201
+ ):
202
+ if len(state_dict) == 0:
203
+ state_dict = load_state_dict(file_path)
204
+
205
+ # Load models with strict matching
206
+ keys_hash_with_shape = hash_state_dict_keys(state_dict, with_shape=True)
207
+ if keys_hash_with_shape in self.keys_hash_with_shape_dict:
208
+ model_names, model_classes, model_resource = self.keys_hash_with_shape_dict[
209
+ keys_hash_with_shape
210
+ ]
211
+ loaded_model_names, loaded_models = load_model_from_single_file(
212
+ state_dict,
213
+ model_names,
214
+ model_classes,
215
+ model_resource,
216
+ torch_dtype,
217
+ device,
218
+ )
219
+ return loaded_model_names, loaded_models
220
+
221
+ # Load models without strict matching
222
+ # (the shape of parameters may be inconsistent, and the state_dict_converter will modify the model architecture)
223
+ keys_hash = hash_state_dict_keys(state_dict, with_shape=False)
224
+ if keys_hash in self.keys_hash_dict:
225
+ model_names, model_classes, model_resource = self.keys_hash_dict[keys_hash]
226
+ loaded_model_names, loaded_models = load_model_from_single_file(
227
+ state_dict,
228
+ model_names,
229
+ model_classes,
230
+ model_resource,
231
+ torch_dtype,
232
+ device,
233
+ )
234
+ return loaded_model_names, loaded_models
235
+
236
+ # Neither key hash matched; return empty results instead of undefined names.
+ return [], []
237
+
238
+
239
+ class ModelDetectorFromSplitedSingleFile(ModelDetectorFromSingleFile):
240
+ def __init__(self, model_loader_configs=[]):
241
+ super().__init__(model_loader_configs)
242
+
243
+ def match(self, file_path="", state_dict={}):
244
+ if isinstance(file_path, str) and os.path.isdir(file_path):
245
+ return False
246
+ if len(state_dict) == 0:
247
+ state_dict = load_state_dict(file_path)
248
+ splited_state_dict = split_state_dict_with_prefix(state_dict)
249
+ for sub_state_dict in splited_state_dict:
250
+ if super().match(file_path, sub_state_dict):
251
+ return True
252
+ return False
253
+
254
+ def load(
255
+ self,
256
+ file_path="",
257
+ state_dict={},
258
+ device="cuda",
259
+ torch_dtype=torch.float16,
260
+ **kwargs,
261
+ ):
262
+ # Split the state_dict and load from each component
263
+ splited_state_dict = split_state_dict_with_prefix(state_dict)
264
+ valid_state_dict = {}
265
+ for sub_state_dict in splited_state_dict:
266
+ if super().match(file_path, sub_state_dict):
267
+ valid_state_dict.update(sub_state_dict)
268
+ if super().match(file_path, valid_state_dict):
269
+ loaded_model_names, loaded_models = super().load(
270
+ file_path, valid_state_dict, device, torch_dtype
271
+ )
272
+ else:
273
+ loaded_model_names, loaded_models = [], []
274
+ for sub_state_dict in splited_state_dict:
275
+ if super().match(file_path, sub_state_dict):
276
+ loaded_model_names_, loaded_models_ = super().load(
277
+ file_path, sub_state_dict, device, torch_dtype
278
+ )
279
+ loaded_model_names += loaded_model_names_
280
+ loaded_models += loaded_models_
281
+ return loaded_model_names, loaded_models
282
+
283
+
284
+ class ModelDetectorFromHuggingfaceFolder:
285
+ def __init__(self, model_loader_configs=[]):
286
+ self.architecture_dict = {}
287
+ for metadata in model_loader_configs:
288
+ self.add_model_metadata(*metadata)
289
+
290
+ def add_model_metadata(
291
+ self, architecture, huggingface_lib, model_name, redirected_architecture
292
+ ):
293
+ self.architecture_dict[architecture] = (
294
+ huggingface_lib,
295
+ model_name,
296
+ redirected_architecture,
297
+ )
298
+
299
+ def match(self, file_path="", state_dict={}):
300
+ if not isinstance(file_path, str) or os.path.isfile(file_path):
301
+ return False
302
+ file_list = os.listdir(file_path)
303
+ if "config.json" not in file_list:
304
+ return False
305
+ with open(os.path.join(file_path, "config.json"), "r") as f:
306
+ config = json.load(f)
307
+ if "architectures" not in config and "_class_name" not in config:
308
+ return False
309
+ return True
310
+
311
+ def load(
312
+ self,
313
+ file_path="",
314
+ state_dict={},
315
+ device="cuda",
316
+ torch_dtype=torch.float16,
317
+ **kwargs,
318
+ ):
319
+ with open(os.path.join(file_path, "config.json"), "r") as f:
320
+ config = json.load(f)
321
+ loaded_model_names, loaded_models = [], []
322
+ architectures = (
323
+ config["architectures"]
324
+ if "architectures" in config
325
+ else [config["_class_name"]]
326
+ )
327
+ for architecture in architectures:
328
+ huggingface_lib, model_name, redirected_architecture = (
329
+ self.architecture_dict[architecture]
330
+ )
331
+ if redirected_architecture is not None:
332
+ architecture = redirected_architecture
333
+ model_class = importlib.import_module(huggingface_lib).__getattribute__(
334
+ architecture
335
+ )
336
+ loaded_model_names_, loaded_models_ = load_model_from_huggingface_folder(
337
+ file_path, [model_name], [model_class], torch_dtype, device
338
+ )
339
+ loaded_model_names += loaded_model_names_
340
+ loaded_models += loaded_models_
341
+ return loaded_model_names, loaded_models
342
+
343
+
344
+ class ModelDetectorFromPatchedSingleFile:
345
+ def __init__(self, model_loader_configs=[]):
346
+ self.keys_hash_with_shape_dict = {}
347
+ for metadata in model_loader_configs:
348
+ self.add_model_metadata(*metadata)
349
+
350
+ def add_model_metadata(
351
+ self, keys_hash_with_shape, model_name, model_class, extra_kwargs
352
+ ):
353
+ self.keys_hash_with_shape_dict[keys_hash_with_shape] = (
354
+ model_name,
355
+ model_class,
356
+ extra_kwargs,
357
+ )
358
+
359
+ def match(self, file_path="", state_dict={}):
360
+ if not isinstance(file_path, str) or os.path.isdir(file_path):
361
+ return False
362
+ if len(state_dict) == 0:
363
+ state_dict = load_state_dict(file_path)
364
+ keys_hash_with_shape = hash_state_dict_keys(state_dict, with_shape=True)
365
+ if keys_hash_with_shape in self.keys_hash_with_shape_dict:
366
+ return True
367
+ return False
368
+
369
+ def load(
370
+ self,
371
+ file_path="",
372
+ state_dict={},
373
+ device="cuda",
374
+ torch_dtype=torch.float16,
375
+ model_manager=None,
376
+ **kwargs,
377
+ ):
378
+ if len(state_dict) == 0:
379
+ state_dict = load_state_dict(file_path)
380
+
381
+ # Load models with strict matching
382
+ loaded_model_names, loaded_models = [], []
383
+ keys_hash_with_shape = hash_state_dict_keys(state_dict, with_shape=True)
384
+ if keys_hash_with_shape in self.keys_hash_with_shape_dict:
385
+ model_names, model_classes, extra_kwargs = self.keys_hash_with_shape_dict[
386
+ keys_hash_with_shape
387
+ ]
388
+ loaded_model_names_, loaded_models_ = load_patch_model_from_single_file(
389
+ state_dict,
390
+ model_names,
391
+ model_classes,
392
+ extra_kwargs,
393
+ model_manager,
394
+ torch_dtype,
395
+ device,
396
+ )
397
+ loaded_model_names += loaded_model_names_
398
+ loaded_models += loaded_models_
399
+ return loaded_model_names, loaded_models
400
+
401
+
402
+ class ModelManager:
403
+ def __init__(
404
+ self,
405
+ torch_dtype=torch.float16,
406
+ device="cuda",
407
+ model_id_list: List[Preset_model_id] = [],
408
+ downloading_priority: List[Preset_model_website] = [
409
+ "ModelScope",
410
+ "HuggingFace",
411
+ ],
412
+ file_path_list: List[str] = [],
413
+ ):
414
+ self.torch_dtype = torch_dtype
415
+ self.device = device
416
+ self.model = []
417
+ self.model_path = []
418
+ self.model_name = []
419
+ downloaded_files = (
420
+ download_models(model_id_list, downloading_priority)
421
+ if len(model_id_list) > 0
422
+ else []
423
+ )
424
+ self.model_detector = [
425
+ ModelDetectorFromSingleFile(model_loader_configs),
426
+ ModelDetectorFromSplitedSingleFile(model_loader_configs),
427
+ ModelDetectorFromHuggingfaceFolder(huggingface_model_loader_configs),
428
+ ModelDetectorFromPatchedSingleFile(patch_model_loader_configs),
429
+ ]
430
+ self.load_models(downloaded_files + file_path_list)
431
+
432
+ def load_model_from_single_file(
433
+ self,
434
+ file_path="",
435
+ state_dict={},
436
+ model_names=[],
437
+ model_classes=[],
438
+ model_resource=None,
439
+ ):
440
+ print(f"Loading models from file: {file_path}")
441
+ if len(state_dict) == 0:
442
+ state_dict = load_state_dict(file_path)
443
+ model_names, models = load_model_from_single_file(
444
+ state_dict,
445
+ model_names,
446
+ model_classes,
447
+ model_resource,
448
+ self.torch_dtype,
449
+ self.device,
450
+ )
451
+ for model_name, model in zip(model_names, models):
452
+ self.model.append(model)
453
+ self.model_path.append(file_path)
454
+ self.model_name.append(model_name)
455
+ print(f" The following models are loaded: {model_names}.")
456
+
457
+ def load_model_from_huggingface_folder(
458
+ self, file_path="", model_names=[], model_classes=[]
459
+ ):
460
+ print(f"Loading models from folder: {file_path}")
461
+ model_names, models = load_model_from_huggingface_folder(
462
+ file_path, model_names, model_classes, self.torch_dtype, self.device
463
+ )
464
+ for model_name, model in zip(model_names, models):
465
+ self.model.append(model)
466
+ self.model_path.append(file_path)
467
+ self.model_name.append(model_name)
468
+ print(f" The following models are loaded: {model_names}.")
469
+
470
+ def load_patch_model_from_single_file(
471
+ self,
472
+ file_path="",
473
+ state_dict={},
474
+ model_names=[],
475
+ model_classes=[],
476
+ extra_kwargs={},
477
+ ):
478
+ print(f"Loading patch models from file: {file_path}")
479
+ model_names, models = load_patch_model_from_single_file(
480
+ state_dict,
481
+ model_names,
482
+ model_classes,
483
+ extra_kwargs,
484
+ self,
485
+ self.torch_dtype,
486
+ self.device,
487
+ )
488
+ for model_name, model in zip(model_names, models):
489
+ self.model.append(model)
490
+ self.model_path.append(file_path)
491
+ self.model_name.append(model_name)
492
+ print(f" The following patched models are loaded: {model_names}.")
493
+
494
+ def load_lora(self, file_path="", state_dict={}, lora_alpha=1.0):
495
+ if isinstance(file_path, list):
496
+ for file_path_ in file_path:
497
+ self.load_lora(file_path_, state_dict=state_dict, lora_alpha=lora_alpha)
498
+ else:
499
+ print(f"Loading LoRA models from file: {file_path}")
500
+ is_loaded = False
501
+ if len(state_dict) == 0:
502
+ state_dict = load_state_dict(file_path)
503
+ for model_name, model, model_path in zip(
504
+ self.model_name, self.model, self.model_path
505
+ ):
506
+ for lora in get_lora_loaders():
507
+ match_results = lora.match(model, state_dict)
508
+ if match_results is not None:
509
+ print(f" Adding LoRA to {model_name} ({model_path}).")
510
+ lora_prefix, model_resource = match_results
511
+ lora.load(
512
+ model,
513
+ state_dict,
514
+ lora_prefix,
515
+ alpha=lora_alpha,
516
+ model_resource=model_resource,
517
+ )
518
+ is_loaded = True
519
+ break
520
+ if not is_loaded:
521
+ print(f" Cannot load LoRA: {file_path}")
522
+
523
+ def load_model(self, file_path, model_names=None, device=None, torch_dtype=None):
524
+ print(f"Loading models from: {file_path}")
525
+ if device is None:
526
+ device = self.device
527
+ if torch_dtype is None:
528
+ torch_dtype = self.torch_dtype
529
+ if isinstance(file_path, list):
530
+ state_dict = {}
531
+ for path in file_path:
532
+ state_dict.update(load_state_dict(path))
533
+ elif os.path.isfile(file_path):
534
+ state_dict = load_state_dict(file_path)
535
+ else:
536
+ state_dict = None
537
+ for model_detector in self.model_detector:
538
+ if model_detector.match(file_path, state_dict):
539
+ model_names, models = model_detector.load(
540
+ file_path,
541
+ state_dict,
542
+ device=device,
543
+ torch_dtype=torch_dtype,
544
+ allowed_model_names=model_names,
545
+ model_manager=self,
546
+ )
547
+ for model_name, model in zip(model_names, models):
548
+ self.model.append(model)
549
+ self.model_path.append(file_path)
550
+ self.model_name.append(model_name)
551
+ print(f" The following models are loaded: {model_names}.")
552
+ break
553
+ else:
554
+ print(f" We cannot detect the model type. No models are loaded.")
555
+
556
+ def load_models(
557
+ self, file_path_list, model_names=None, device=None, torch_dtype=None
558
+ ):
559
+ for file_path in file_path_list:
560
+ self.load_model(
561
+ file_path, model_names, device=device, torch_dtype=torch_dtype
562
+ )
563
+
564
+ def fetch_model(
565
+ self, model_name, file_path=None, require_model_path=False, index=None
566
+ ):
567
+ fetched_models = []
568
+ fetched_model_paths = []
569
+ for model, model_path, model_name_ in zip(
570
+ self.model, self.model_path, self.model_name
571
+ ):
572
+ if file_path is not None and file_path != model_path:
573
+ continue
574
+ if model_name == model_name_:
575
+ fetched_models.append(model)
576
+ fetched_model_paths.append(model_path)
577
+ if len(fetched_models) == 0:
578
+ print(f"No {model_name} models available.")
579
+ return None
580
+ if len(fetched_models) == 1:
581
+ print(f"Using {model_name} from {fetched_model_paths[0]}.")
582
+ model = fetched_models[0]
583
+ path = fetched_model_paths[0]
584
+ else:
585
+ if index is None:
586
+ model = fetched_models[0]
587
+ path = fetched_model_paths[0]
588
+ print(
589
+ f"More than one {model_name} models are loaded in model manager: {fetched_model_paths}. Using {model_name} from {fetched_model_paths[0]}."
590
+ )
591
+ elif isinstance(index, int):
592
+ model = fetched_models[:index]
593
+ path = fetched_model_paths[:index]
594
+ print(
595
+ f"More than one {model_name} models are loaded in model manager: {fetched_model_paths}. Using {model_name} from {fetched_model_paths[:index]}."
596
+ )
597
+ else:
598
+ model = fetched_models
599
+ path = fetched_model_paths
600
+ print(
601
+ f"More than one {model_name} models are loaded in model manager: {fetched_model_paths}. Using {model_name} from {fetched_model_paths}."
602
+ )
603
+ if require_model_path:
604
+ return model, path
605
+ else:
606
+ return model
607
+
608
+ def to(self, device):
609
+ for model in self.model:
610
+ model.to(device)
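A minimal sketch of how the ModelManager above is typically driven. The checkpoint file names and the "wan_video_dit" model name are assumptions based on the Wan 2.1 layout used elsewhere in this repo, not values guaranteed by this file.

import torch
from models.model_manager import ModelManager

# Detection is hash-based: each file's state-dict keys are matched against
# configs/model_config.py, so only files registered there will load.
manager = ModelManager(torch_dtype=torch.bfloat16, device="cpu")
manager.load_models([
    "checkpoints/base_model/diffusion_pytorch_model.safetensors",  # assumed DiT weights
    "checkpoints/base_model/models_t5_umt5-xxl-enc-bf16.pth",      # assumed text encoder
    "checkpoints/base_model/Wan2.1_VAE.pth",                       # assumed VAE
])
dit = manager.fetch_model("wan_video_dit")  # assumed model name; returns None if nothing matched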
models/set_condition_branch.py ADDED
@@ -0,0 +1,41 @@
1
+ import torch
2
+
3
+
4
+ def set_stand_in(pipe, train=False, model_path=None):
5
+ for block in pipe.dit.blocks:
6
+ block.self_attn.init_lora(train)
7
+ if model_path is not None:
8
+ print(f"Loading Stand-In weights from: {model_path}")
9
+ load_lora_weights_into_pipe(pipe, model_path)
10
+
11
+
12
+ def load_lora_weights_into_pipe(pipe, ckpt_path, strict=True):
13
+ ckpt = torch.load(ckpt_path, map_location="cpu")
14
+ state_dict = ckpt.get("state_dict", ckpt)
15
+
16
+ model = {}
17
+ for i, block in enumerate(pipe.dit.blocks):
18
+ prefix = f"blocks.{i}.self_attn."
19
+ attn = block.self_attn
20
+ for name in ["q_loras", "k_loras", "v_loras"]:
21
+ for sub in ["down", "up"]:
22
+ key = f"{prefix}{name}.{sub}.weight"
23
+ if hasattr(getattr(attn, name), sub):
24
+ model[key] = getattr(getattr(attn, name), sub).weight
25
+ else:
26
+ if strict:
27
+ raise KeyError(f"Missing module: {key}")
28
+
29
+ for k, param in state_dict.items():
30
+ if k in model:
31
+ if model[k].shape != param.shape:
32
+ if strict:
33
+ raise ValueError(
34
+ f"Shape mismatch: {k} | {model[k].shape} vs {param.shape}"
35
+ )
36
+ else:
37
+ continue
38
+ model[k].data.copy_(param)
39
+ else:
40
+ if strict:
41
+ raise KeyError(f"Unexpected key in ckpt: {k}")
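For context, this is how the infer scripts earlier in this diff drive set_stand_in, shown here in isolation as a sketch (load_wan_pipe comes from wan_loader, and the paths mirror the argparse defaults above).

import torch
from wan_loader import load_wan_pipe
from models.set_condition_branch import set_stand_in

pipe = load_wan_pipe(base_path="checkpoints/base_model/", torch_dtype=torch.bfloat16)

# Adds the q/k/v LoRA branches to every DiT self-attention block, then copies
# the released Stand-In weights into them.
set_stand_in(pipe, model_path="checkpoints/Stand-In/Stand-In_wan2.1_T2V_14B_ver1.0.ckpt")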
models/tiler.py ADDED
@@ -0,0 +1,333 @@
1
+ import torch
2
+ from einops import rearrange, repeat
3
+
4
+
5
+ class TileWorker:
6
+ def __init__(self):
7
+ pass
8
+
9
+ def mask(self, height, width, border_width):
10
+ # Create a mask with shape (height, width).
11
+ # The centre area is filled with 1, and the border line is filled with values in range (0, 1].
12
+ x = torch.arange(height).repeat(width, 1).T
13
+ y = torch.arange(width).repeat(height, 1)
14
+ mask = torch.stack([x + 1, height - x, y + 1, width - y]).min(dim=0).values
15
+ mask = (mask / border_width).clip(0, 1)
16
+ return mask
17
+
18
+ def tile(self, model_input, tile_size, tile_stride, tile_device, tile_dtype):
19
+ # Convert a tensor (b, c, h, w) to (b, c, tile_size, tile_size, tile_num)
20
+ batch_size, channel, _, _ = model_input.shape
21
+ model_input = model_input.to(device=tile_device, dtype=tile_dtype)
22
+ unfold_operator = torch.nn.Unfold(
23
+ kernel_size=(tile_size, tile_size), stride=(tile_stride, tile_stride)
24
+ )
25
+ model_input = unfold_operator(model_input)
26
+ model_input = model_input.view((batch_size, channel, tile_size, tile_size, -1))
27
+
28
+ return model_input
29
+
30
+ def tiled_inference(
31
+ self,
32
+ forward_fn,
33
+ model_input,
34
+ tile_batch_size,
35
+ inference_device,
36
+ inference_dtype,
37
+ tile_device,
38
+ tile_dtype,
39
+ ):
40
+ # Call y=forward_fn(x) for each tile
41
+ tile_num = model_input.shape[-1]
42
+ model_output_stack = []
43
+
44
+ for tile_id in range(0, tile_num, tile_batch_size):
45
+ # process input
46
+ tile_id_ = min(tile_id + tile_batch_size, tile_num)
47
+ x = model_input[:, :, :, :, tile_id:tile_id_]
48
+ x = x.to(device=inference_device, dtype=inference_dtype)
49
+ x = rearrange(x, "b c h w n -> (n b) c h w")
50
+
51
+ # process output
52
+ y = forward_fn(x)
53
+ y = rearrange(y, "(n b) c h w -> b c h w n", n=tile_id_ - tile_id)
54
+ y = y.to(device=tile_device, dtype=tile_dtype)
55
+ model_output_stack.append(y)
56
+
57
+ model_output = torch.concat(model_output_stack, dim=-1)
58
+ return model_output
59
+
60
+ def io_scale(self, model_output, tile_size):
61
+ # Determine the size modification happened in forward_fn
62
+ # We only consider the same scale on height and width.
63
+ io_scale = model_output.shape[2] / tile_size
64
+ return io_scale
65
+
66
+ def untile(
67
+ self,
68
+ model_output,
69
+ height,
70
+ width,
71
+ tile_size,
72
+ tile_stride,
73
+ border_width,
74
+ tile_device,
75
+ tile_dtype,
76
+ ):
77
+ # The reversed function of tile
78
+ mask = self.mask(tile_size, tile_size, border_width)
79
+ mask = mask.to(device=tile_device, dtype=tile_dtype)
80
+ mask = rearrange(mask, "h w -> 1 1 h w 1")
81
+ model_output = model_output * mask
82
+
83
+ fold_operator = torch.nn.Fold(
84
+ output_size=(height, width),
85
+ kernel_size=(tile_size, tile_size),
86
+ stride=(tile_stride, tile_stride),
87
+ )
88
+ mask = repeat(mask[0, 0, :, :, 0], "h w -> 1 (h w) n", n=model_output.shape[-1])
89
+ model_output = rearrange(model_output, "b c h w n -> b (c h w) n")
90
+ model_output = fold_operator(model_output) / fold_operator(mask)
91
+
92
+ return model_output
93
+
94
+ def tiled_forward(
95
+ self,
96
+ forward_fn,
97
+ model_input,
98
+ tile_size,
99
+ tile_stride,
100
+ tile_batch_size=1,
101
+ tile_device="cpu",
102
+ tile_dtype=torch.float32,
103
+ border_width=None,
104
+ ):
105
+ # Prepare
106
+ inference_device, inference_dtype = model_input.device, model_input.dtype
107
+ height, width = model_input.shape[2], model_input.shape[3]
108
+ border_width = int(tile_stride * 0.5) if border_width is None else border_width
109
+
110
+ # tile
111
+ model_input = self.tile(
112
+ model_input, tile_size, tile_stride, tile_device, tile_dtype
113
+ )
114
+
115
+ # inference
116
+ model_output = self.tiled_inference(
117
+ forward_fn,
118
+ model_input,
119
+ tile_batch_size,
120
+ inference_device,
121
+ inference_dtype,
122
+ tile_device,
123
+ tile_dtype,
124
+ )
125
+
126
+ # resize
127
+ io_scale = self.io_scale(model_output, tile_size)
128
+ height, width = int(height * io_scale), int(width * io_scale)
129
+ tile_size, tile_stride = int(tile_size * io_scale), int(tile_stride * io_scale)
130
+ border_width = int(border_width * io_scale)
131
+
132
+ # untile
133
+ model_output = self.untile(
134
+ model_output,
135
+ height,
136
+ width,
137
+ tile_size,
138
+ tile_stride,
139
+ border_width,
140
+ tile_device,
141
+ tile_dtype,
142
+ )
143
+
144
+ # Done!
145
+ model_output = model_output.to(device=inference_device, dtype=inference_dtype)
146
+ return model_output
147
+
148
+
149
+ class FastTileWorker:
150
+ def __init__(self):
151
+ pass
152
+
153
+ def build_mask(self, data, is_bound):
154
+ _, _, H, W = data.shape
155
+ h = repeat(torch.arange(H), "H -> H W", H=H, W=W)
156
+ w = repeat(torch.arange(W), "W -> H W", H=H, W=W)
157
+ border_width = (H + W) // 4
158
+ pad = torch.ones_like(h) * border_width
159
+ mask = (
160
+ torch.stack(
161
+ [
162
+ pad if is_bound[0] else h + 1,
163
+ pad if is_bound[1] else H - h,
164
+ pad if is_bound[2] else w + 1,
165
+ pad if is_bound[3] else W - w,
166
+ ]
167
+ )
168
+ .min(dim=0)
169
+ .values
170
+ )
171
+ mask = mask.clip(1, border_width)
172
+ mask = (mask / border_width).to(dtype=data.dtype, device=data.device)
173
+ mask = rearrange(mask, "H W -> 1 H W")
174
+ return mask
175
+
176
+ def tiled_forward(
177
+ self,
178
+ forward_fn,
179
+ model_input,
180
+ tile_size,
181
+ tile_stride,
182
+ tile_device="cpu",
183
+ tile_dtype=torch.float32,
184
+ border_width=None,
185
+ ):
186
+ # Prepare
187
+ B, C, H, W = model_input.shape
188
+ border_width = int(tile_stride * 0.5) if border_width is None else border_width
189
+ weight = torch.zeros((1, 1, H, W), dtype=tile_dtype, device=tile_device)
190
+ values = torch.zeros((B, C, H, W), dtype=tile_dtype, device=tile_device)
191
+
192
+ # Split tasks
193
+ tasks = []
194
+ for h in range(0, H, tile_stride):
195
+ for w in range(0, W, tile_stride):
196
+ if (h - tile_stride >= 0 and h - tile_stride + tile_size >= H) or (
197
+ w - tile_stride >= 0 and w - tile_stride + tile_size >= W
198
+ ):
199
+ continue
200
+ h_, w_ = h + tile_size, w + tile_size
201
+ if h_ > H:
202
+ h, h_ = H - tile_size, H
203
+ if w_ > W:
204
+ w, w_ = W - tile_size, W
205
+ tasks.append((h, h_, w, w_))
206
+
207
+ # Run
208
+ for hl, hr, wl, wr in tasks:
209
+ # Forward
210
+ hidden_states_batch = forward_fn(hl, hr, wl, wr).to(
211
+ dtype=tile_dtype, device=tile_device
212
+ )
213
+
214
+ mask = self.build_mask(
215
+ hidden_states_batch, is_bound=(hl == 0, hr >= H, wl == 0, wr >= W)
216
+ )
217
+ values[:, :, hl:hr, wl:wr] += hidden_states_batch * mask
218
+ weight[:, :, hl:hr, wl:wr] += mask
219
+ values /= weight
220
+ return values
221
+
222
+
223
+ class TileWorker2Dto3D:
224
+ """
225
+ Process 3D tensors, but only enable TileWorker on 2D.
226
+ """
227
+
228
+ def __init__(self):
229
+ pass
230
+
231
+ def build_mask(self, T, H, W, dtype, device, is_bound, border_width):
232
+ t = repeat(torch.arange(T), "T -> T H W", T=T, H=H, W=W)
233
+ h = repeat(torch.arange(H), "H -> T H W", T=T, H=H, W=W)
234
+ w = repeat(torch.arange(W), "W -> T H W", T=T, H=H, W=W)
235
+ border_width = (H + W) // 4 if border_width is None else border_width
236
+ pad = torch.ones_like(h) * border_width
237
+ mask = (
238
+ torch.stack(
239
+ [
240
+ pad if is_bound[0] else t + 1,
241
+ pad if is_bound[1] else T - t,
242
+ pad if is_bound[2] else h + 1,
243
+ pad if is_bound[3] else H - h,
244
+ pad if is_bound[4] else w + 1,
245
+ pad if is_bound[5] else W - w,
246
+ ]
247
+ )
248
+ .min(dim=0)
249
+ .values
250
+ )
251
+ mask = mask.clip(1, border_width)
252
+ mask = (mask / border_width).to(dtype=dtype, device=device)
253
+ mask = rearrange(mask, "T H W -> 1 1 T H W")
254
+ return mask
255
+
256
+ def tiled_forward(
257
+ self,
258
+ forward_fn,
259
+ model_input,
260
+ tile_size,
261
+ tile_stride,
262
+ tile_device="cpu",
263
+ tile_dtype=torch.float32,
264
+ computation_device="cuda",
265
+ computation_dtype=torch.float32,
266
+ border_width=None,
267
+ scales=[1, 1, 1, 1],
268
+ progress_bar=lambda x: x,
269
+ ):
270
+ B, C, T, H, W = model_input.shape
271
+ scale_C, scale_T, scale_H, scale_W = scales
272
+ tile_size_H, tile_size_W = tile_size
273
+ tile_stride_H, tile_stride_W = tile_stride
274
+
275
+ value = torch.zeros(
276
+ (B, int(C * scale_C), int(T * scale_T), int(H * scale_H), int(W * scale_W)),
277
+ dtype=tile_dtype,
278
+ device=tile_device,
279
+ )
280
+ weight = torch.zeros(
281
+ (1, 1, int(T * scale_T), int(H * scale_H), int(W * scale_W)),
282
+ dtype=tile_dtype,
283
+ device=tile_device,
284
+ )
285
+
286
+ # Split tasks
287
+ tasks = []
288
+ for h in range(0, H, tile_stride_H):
289
+ for w in range(0, W, tile_stride_W):
290
+ if (
291
+ h - tile_stride_H >= 0 and h - tile_stride_H + tile_size_H >= H
292
+ ) or (w - tile_stride_W >= 0 and w - tile_stride_W + tile_size_W >= W):
293
+ continue
294
+ h_, w_ = h + tile_size_H, w + tile_size_W
295
+ if h_ > H:
296
+ h, h_ = max(H - tile_size_H, 0), H
297
+ if w_ > W:
298
+ w, w_ = max(W - tile_size_W, 0), W
299
+ tasks.append((h, h_, w, w_))
300
+
301
+ # Run
302
+ for hl, hr, wl, wr in progress_bar(tasks):
303
+ mask = self.build_mask(
304
+ int(T * scale_T),
305
+ int((hr - hl) * scale_H),
306
+ int((wr - wl) * scale_W),
307
+ tile_dtype,
308
+ tile_device,
309
+ is_bound=(True, True, hl == 0, hr >= H, wl == 0, wr >= W),
310
+ border_width=border_width,
311
+ )
312
+ grid_input = model_input[:, :, :, hl:hr, wl:wr].to(
313
+ dtype=computation_dtype, device=computation_device
314
+ )
315
+ grid_output = forward_fn(grid_input).to(
316
+ dtype=tile_dtype, device=tile_device
317
+ )
318
+ value[
319
+ :,
320
+ :,
321
+ :,
322
+ int(hl * scale_H) : int(hr * scale_H),
323
+ int(wl * scale_W) : int(wr * scale_W),
324
+ ] += grid_output * mask
325
+ weight[
326
+ :,
327
+ :,
328
+ :,
329
+ int(hl * scale_H) : int(hr * scale_H),
330
+ int(wl * scale_W) : int(wr * scale_W),
331
+ ] += mask
332
+ value = value / weight
333
+ return value
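A self-contained sketch of TileWorker.tiled_forward above, using an arbitrary convolution as the per-tile function; the sizes are illustrative and chosen so a 512x512 input tiles exactly with kernel 128 and stride 96.

import torch
from models.tiler import TileWorker

conv = torch.nn.Conv2d(4, 4, kernel_size=3, padding=1)  # stand-in for an expensive model
x = torch.randn(1, 4, 512, 512)

worker = TileWorker()
y = worker.tiled_forward(
    forward_fn=conv,          # called tile-by-tile on (n, 4, 128, 128) batches
    model_input=x,
    tile_size=128,
    tile_stride=96,
    tile_batch_size=4,
    tile_device="cpu",        # tiles are stored and blended on CPU to save memory
    tile_dtype=torch.float32,
)
print(y.shape)  # torch.Size([1, 4, 512, 512])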
models/utils.py ADDED
@@ -0,0 +1,219 @@
1
+ import torch, os
2
+ from safetensors import safe_open
3
+ from contextlib import contextmanager
4
+ import hashlib
5
+
6
+
7
+ @contextmanager
8
+ def init_weights_on_device(device=torch.device("meta"), include_buffers: bool = False):
9
+ old_register_parameter = torch.nn.Module.register_parameter
10
+ if include_buffers:
11
+ old_register_buffer = torch.nn.Module.register_buffer
12
+
13
+ def register_empty_parameter(module, name, param):
14
+ old_register_parameter(module, name, param)
15
+ if param is not None:
16
+ param_cls = type(module._parameters[name])
17
+ kwargs = module._parameters[name].__dict__
18
+ kwargs["requires_grad"] = param.requires_grad
19
+ module._parameters[name] = param_cls(
20
+ module._parameters[name].to(device), **kwargs
21
+ )
22
+
23
+ def register_empty_buffer(module, name, buffer, persistent=True):
24
+ old_register_buffer(module, name, buffer, persistent=persistent)
25
+ if buffer is not None:
26
+ module._buffers[name] = module._buffers[name].to(device)
27
+
28
+ def patch_tensor_constructor(fn):
29
+ def wrapper(*args, **kwargs):
30
+ kwargs["device"] = device
31
+ return fn(*args, **kwargs)
32
+
33
+ return wrapper
34
+
35
+ if include_buffers:
36
+ tensor_constructors_to_patch = {
37
+ torch_function_name: getattr(torch, torch_function_name)
38
+ for torch_function_name in ["empty", "zeros", "ones", "full"]
39
+ }
40
+ else:
41
+ tensor_constructors_to_patch = {}
42
+
43
+ try:
44
+ torch.nn.Module.register_parameter = register_empty_parameter
45
+ if include_buffers:
46
+ torch.nn.Module.register_buffer = register_empty_buffer
47
+ for torch_function_name in tensor_constructors_to_patch.keys():
48
+ setattr(
49
+ torch,
50
+ torch_function_name,
51
+ patch_tensor_constructor(getattr(torch, torch_function_name)),
52
+ )
53
+ yield
54
+ finally:
55
+ torch.nn.Module.register_parameter = old_register_parameter
56
+ if include_buffers:
57
+ torch.nn.Module.register_buffer = old_register_buffer
58
+ for (
59
+ torch_function_name,
60
+ old_torch_function,
61
+ ) in tensor_constructors_to_patch.items():
62
+ setattr(torch, torch_function_name, old_torch_function)
63
+
64
+
65
+ def load_state_dict_from_folder(file_path, torch_dtype=None):
66
+ state_dict = {}
67
+ for file_name in os.listdir(file_path):
68
+ if "." in file_name and file_name.split(".")[-1] in [
69
+ "safetensors",
70
+ "bin",
71
+ "ckpt",
72
+ "pth",
73
+ "pt",
74
+ ]:
75
+ state_dict.update(
76
+ load_state_dict(
77
+ os.path.join(file_path, file_name), torch_dtype=torch_dtype
78
+ )
79
+ )
80
+ return state_dict
81
+
82
+
83
+ def load_state_dict(file_path, torch_dtype=None, device="cpu"):
84
+ if file_path.endswith(".safetensors"):
85
+ return load_state_dict_from_safetensors(
86
+ file_path, torch_dtype=torch_dtype, device=device
87
+ )
88
+ else:
89
+ return load_state_dict_from_bin(
90
+ file_path, torch_dtype=torch_dtype, device=device
91
+ )
92
+
93
+
94
+ def load_state_dict_from_safetensors(file_path, torch_dtype=None, device="cpu"):
95
+ state_dict = {}
96
+ with safe_open(file_path, framework="pt", device=str(device)) as f:
97
+ for k in f.keys():
98
+ state_dict[k] = f.get_tensor(k)
99
+ if torch_dtype is not None:
100
+ state_dict[k] = state_dict[k].to(torch_dtype)
101
+ return state_dict
102
+
103
+
104
+ def load_state_dict_from_bin(file_path, torch_dtype=None, device="cpu"):
105
+ state_dict = torch.load(file_path, map_location=device, weights_only=True)
106
+ if torch_dtype is not None:
107
+ for i in state_dict:
108
+ if isinstance(state_dict[i], torch.Tensor):
109
+ state_dict[i] = state_dict[i].to(torch_dtype)
110
+ return state_dict
111
+
112
+
113
+ def search_for_embeddings(state_dict):
114
+ embeddings = []
115
+ for k in state_dict:
116
+ if isinstance(state_dict[k], torch.Tensor):
117
+ embeddings.append(state_dict[k])
118
+ elif isinstance(state_dict[k], dict):
119
+ embeddings += search_for_embeddings(state_dict[k])
120
+ return embeddings
121
+
122
+
123
+ def search_parameter(param, state_dict):
124
+ for name, param_ in state_dict.items():
125
+ if param.numel() == param_.numel():
126
+ if param.shape == param_.shape:
127
+ if torch.dist(param, param_) < 1e-3:
128
+ return name
129
+ else:
130
+ if torch.dist(param.flatten(), param_.flatten()) < 1e-3:
131
+ return name
132
+ return None
133
+
134
+
135
+ def build_rename_dict(source_state_dict, target_state_dict, split_qkv=False):
136
+ matched_keys = set()
137
+ with torch.no_grad():
138
+ for name in source_state_dict:
139
+ rename = search_parameter(source_state_dict[name], target_state_dict)
140
+ if rename is not None:
141
+ print(f'"{name}": "{rename}",')
142
+ matched_keys.add(rename)
143
+ elif (
144
+ split_qkv
145
+ and len(source_state_dict[name].shape) >= 1
146
+ and source_state_dict[name].shape[0] % 3 == 0
147
+ ):
148
+ length = source_state_dict[name].shape[0] // 3
149
+ rename = []
150
+ for i in range(3):
151
+ rename.append(
152
+ search_parameter(
153
+ source_state_dict[name][i * length : i * length + length],
154
+ target_state_dict,
155
+ )
156
+ )
157
+ if None not in rename:
158
+ print(f'"{name}": {rename},')
159
+ for rename_ in rename:
160
+ matched_keys.add(rename_)
161
+ for name in target_state_dict:
162
+ if name not in matched_keys:
163
+ print("Cannot find", name, target_state_dict[name].shape)
164
+
165
+
166
+ def search_for_files(folder, extensions):
167
+ files = []
168
+ if os.path.isdir(folder):
169
+ for file in sorted(os.listdir(folder)):
170
+ files += search_for_files(os.path.join(folder, file), extensions)
171
+ elif os.path.isfile(folder):
172
+ for extension in extensions:
173
+ if folder.endswith(extension):
174
+ files.append(folder)
175
+ break
176
+ return files
177
+
178
+
179
+ def convert_state_dict_keys_to_single_str(state_dict, with_shape=True):
180
+ keys = []
181
+ for key, value in state_dict.items():
182
+ if isinstance(key, str):
183
+ if isinstance(value, torch.Tensor):
184
+ if with_shape:
185
+ shape = "_".join(map(str, list(value.shape)))
186
+ keys.append(key + ":" + shape)
187
+ keys.append(key)
188
+ elif isinstance(value, dict):
189
+ keys.append(
190
+ key
191
+ + "|"
192
+ + convert_state_dict_keys_to_single_str(
193
+ value, with_shape=with_shape
194
+ )
195
+ )
196
+ keys.sort()
197
+ keys_str = ",".join(keys)
198
+ return keys_str
199
+
200
+
201
+ def split_state_dict_with_prefix(state_dict):
202
+ keys = sorted([key for key in state_dict if isinstance(key, str)])
203
+ prefix_dict = {}
204
+ for key in keys:
205
+ prefix = key if "." not in key else key.split(".")[0]
206
+ if prefix not in prefix_dict:
207
+ prefix_dict[prefix] = []
208
+ prefix_dict[prefix].append(key)
209
+ state_dicts = []
210
+ for prefix, keys in prefix_dict.items():
211
+ sub_state_dict = {key: state_dict[key] for key in keys}
212
+ state_dicts.append(sub_state_dict)
213
+ return state_dicts
214
+
215
+
216
+ def hash_state_dict_keys(state_dict, with_shape=True):
217
+ keys_str = convert_state_dict_keys_to_single_str(state_dict, with_shape=with_shape)
218
+ keys_str = keys_str.encode(encoding="UTF-8")
219
+ return hashlib.md5(keys_str).hexdigest()
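A short sketch of the helpers above: hash_state_dict_keys is the fingerprint used by the detectors in models/model_manager.py, and load_state_dict dispatches on the file extension. The commented-out checkpoint path is illustrative.

import torch
from models.utils import hash_state_dict_keys, load_state_dict

toy_state_dict = torch.nn.Linear(4, 4).state_dict()
print(hash_state_dict_keys(toy_state_dict, with_shape=True))   # stable md5 over key names and shapes
print(hash_state_dict_keys(toy_state_dict, with_shape=False))  # key names only

# For real checkpoints (illustrative path):
# state_dict = load_state_dict("checkpoints/base_model/diffusion_pytorch_model.safetensors")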
models/wan_video_camera_controller.py ADDED
@@ -0,0 +1,290 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import numpy as np
4
+ from einops import rearrange
5
+ import os
6
+ from typing_extensions import Literal
7
+
8
+
9
+ class SimpleAdapter(nn.Module):
10
+ def __init__(self, in_dim, out_dim, kernel_size, stride, num_residual_blocks=1):
11
+ super(SimpleAdapter, self).__init__()
12
+
13
+ # Pixel Unshuffle: reduce spatial dimensions by a factor of 8
14
+ self.pixel_unshuffle = nn.PixelUnshuffle(downscale_factor=8)
15
+
16
+ # Convolution: reduce spatial dimensions by a factor
17
+ # of 2 (without overlap)
18
+ self.conv = nn.Conv2d(
19
+ in_dim * 64, out_dim, kernel_size=kernel_size, stride=stride, padding=0
20
+ )
21
+
22
+ # Residual blocks for feature extraction
23
+ self.residual_blocks = nn.Sequential(
24
+ *[ResidualBlock(out_dim) for _ in range(num_residual_blocks)]
25
+ )
26
+
27
+ def forward(self, x):
28
+ # Reshape to merge the frame dimension into batch
29
+ bs, c, f, h, w = x.size()
30
+ x = x.permute(0, 2, 1, 3, 4).contiguous().view(bs * f, c, h, w)
31
+
32
+ # Pixel Unshuffle operation
33
+ x_unshuffled = self.pixel_unshuffle(x)
34
+
35
+ # Convolution operation
36
+ x_conv = self.conv(x_unshuffled)
37
+
38
+ # Feature extraction with residual blocks
39
+ out = self.residual_blocks(x_conv)
40
+
41
+ # Reshape to split the merged (batch * frames) dimension back into batch and frames
42
+ out = out.view(bs, f, out.size(1), out.size(2), out.size(3))
43
+
44
+ # Permute back to (batch, channels, frames, height, width)
45
+ out = out.permute(0, 2, 1, 3, 4)
46
+
47
+ return out
48
+
49
+ def process_camera_coordinates(
50
+ self,
51
+ direction: Literal[
52
+ "Left", "Right", "Up", "Down", "LeftUp", "LeftDown", "RightUp", "RightDown"
53
+ ],
54
+ length: int,
55
+ height: int,
56
+ width: int,
57
+ speed: float = 1 / 54,
58
+ origin=(
59
+ 0,
60
+ 0.532139961,
61
+ 0.946026558,
62
+ 0.5,
63
+ 0.5,
64
+ 0,
65
+ 0,
66
+ 1,
67
+ 0,
68
+ 0,
69
+ 0,
70
+ 0,
71
+ 1,
72
+ 0,
73
+ 0,
74
+ 0,
75
+ 0,
76
+ 1,
77
+ 0,
78
+ ),
79
+ ):
80
+ if origin is None:
81
+ origin = (
82
+ 0,
83
+ 0.532139961,
84
+ 0.946026558,
85
+ 0.5,
86
+ 0.5,
87
+ 0,
88
+ 0,
89
+ 1,
90
+ 0,
91
+ 0,
92
+ 0,
93
+ 0,
94
+ 1,
95
+ 0,
96
+ 0,
97
+ 0,
98
+ 0,
99
+ 1,
100
+ 0,
101
+ )
102
+ coordinates = generate_camera_coordinates(direction, length, speed, origin)
103
+ plucker_embedding = process_pose_file(coordinates, width, height)
104
+ return plucker_embedding
105
+
106
+
107
+ class ResidualBlock(nn.Module):
108
+ def __init__(self, dim):
109
+ super(ResidualBlock, self).__init__()
110
+ self.conv1 = nn.Conv2d(dim, dim, kernel_size=3, padding=1)
111
+ self.relu = nn.ReLU(inplace=True)
112
+ self.conv2 = nn.Conv2d(dim, dim, kernel_size=3, padding=1)
113
+
114
+ def forward(self, x):
115
+ residual = x
116
+ out = self.relu(self.conv1(x))
117
+ out = self.conv2(out)
118
+ out += residual
119
+ return out
120
+
121
+
122
+ class Camera(object):
123
+ """Copied from https://github.com/hehao13/CameraCtrl/blob/main/inference.py"""
124
+
125
+ def __init__(self, entry):
126
+ fx, fy, cx, cy = entry[1:5]
127
+ self.fx = fx
128
+ self.fy = fy
129
+ self.cx = cx
130
+ self.cy = cy
131
+ w2c_mat = np.array(entry[7:]).reshape(3, 4)
132
+ w2c_mat_4x4 = np.eye(4)
133
+ w2c_mat_4x4[:3, :] = w2c_mat
134
+ self.w2c_mat = w2c_mat_4x4
135
+ self.c2w_mat = np.linalg.inv(w2c_mat_4x4)
136
+
137
+
138
+ def get_relative_pose(cam_params):
139
+ """Copied from https://github.com/hehao13/CameraCtrl/blob/main/inference.py"""
140
+ abs_w2cs = [cam_param.w2c_mat for cam_param in cam_params]
141
+ abs_c2ws = [cam_param.c2w_mat for cam_param in cam_params]
142
+ cam_to_origin = 0
143
+ target_cam_c2w = np.array(
144
+ [[1, 0, 0, 0], [0, 1, 0, -cam_to_origin], [0, 0, 1, 0], [0, 0, 0, 1]]
145
+ )
146
+ abs2rel = target_cam_c2w @ abs_w2cs[0]
147
+ ret_poses = [
148
+ target_cam_c2w,
149
+ ] + [abs2rel @ abs_c2w for abs_c2w in abs_c2ws[1:]]
150
+ ret_poses = np.array(ret_poses, dtype=np.float32)
151
+ return ret_poses
152
+
153
+
154
+ def custom_meshgrid(*args):
155
+ # torch>=2.0.0 only
156
+ return torch.meshgrid(*args, indexing="ij")
157
+
158
+
159
+ def ray_condition(K, c2w, H, W, device):
160
+ """Copied from https://github.com/hehao13/CameraCtrl/blob/main/inference.py"""
161
+ # c2w: B, V, 4, 4
162
+ # K: B, V, 4
163
+
164
+ B = K.shape[0]
165
+
166
+ j, i = custom_meshgrid(
167
+ torch.linspace(0, H - 1, H, device=device, dtype=c2w.dtype),
168
+ torch.linspace(0, W - 1, W, device=device, dtype=c2w.dtype),
169
+ )
170
+ i = i.reshape([1, 1, H * W]).expand([B, 1, H * W]) + 0.5 # [B, HxW]
171
+ j = j.reshape([1, 1, H * W]).expand([B, 1, H * W]) + 0.5 # [B, HxW]
172
+
173
+ fx, fy, cx, cy = K.chunk(4, dim=-1) # B,V, 1
174
+
175
+ zs = torch.ones_like(i) # [B, HxW]
176
+ xs = (i - cx) / fx * zs
177
+ ys = (j - cy) / fy * zs
178
+ zs = zs.expand_as(ys)
179
+
180
+ directions = torch.stack((xs, ys, zs), dim=-1) # B, V, HW, 3
181
+ directions = directions / directions.norm(dim=-1, keepdim=True) # B, V, HW, 3
182
+
183
+ rays_d = directions @ c2w[..., :3, :3].transpose(-1, -2) # B, V, 3, HW
184
+ rays_o = c2w[..., :3, 3] # B, V, 3
185
+ rays_o = rays_o[:, :, None].expand_as(rays_d) # B, V, 3, HW
186
+ # c2w @ dirctions
187
+ rays_dxo = torch.linalg.cross(rays_o, rays_d)
188
+ plucker = torch.cat([rays_dxo, rays_d], dim=-1)
189
+ plucker = plucker.reshape(B, c2w.shape[1], H, W, 6) # B, V, H, W, 6
190
+ # plucker = plucker.permute(0, 1, 4, 2, 3)
191
+ return plucker
192
+
193
+
194
+ def process_pose_file(
195
+ cam_params,
196
+ width=672,
197
+ height=384,
198
+ original_pose_width=1280,
199
+ original_pose_height=720,
200
+ device="cpu",
201
+ return_poses=False,
202
+ ):
203
+ if return_poses:
204
+ return cam_params
205
+ else:
206
+ cam_params = [Camera(cam_param) for cam_param in cam_params]
207
+
208
+ sample_wh_ratio = width / height
209
+ pose_wh_ratio = (
210
+ original_pose_width / original_pose_height
211
+ ) # Assuming placeholder ratios, change as needed
212
+
213
+ if pose_wh_ratio > sample_wh_ratio:
214
+ resized_ori_w = height * pose_wh_ratio
215
+ for cam_param in cam_params:
216
+ cam_param.fx = resized_ori_w * cam_param.fx / width
217
+ else:
218
+ resized_ori_h = width / pose_wh_ratio
219
+ for cam_param in cam_params:
220
+ cam_param.fy = resized_ori_h * cam_param.fy / height
221
+
222
+ intrinsic = np.asarray(
223
+ [
224
+ [
225
+ cam_param.fx * width,
226
+ cam_param.fy * height,
227
+ cam_param.cx * width,
228
+ cam_param.cy * height,
229
+ ]
230
+ for cam_param in cam_params
231
+ ],
232
+ dtype=np.float32,
233
+ )
234
+
235
+ K = torch.as_tensor(intrinsic)[None] # [1, n_frame, 4]
236
+ c2ws = get_relative_pose(
237
+ cam_params
238
+ ) # get_relative_pose is defined above in this module
239
+ c2ws = torch.as_tensor(c2ws)[None] # [1, n_frame, 4, 4]
240
+ plucker_embedding = (
241
+ ray_condition(K, c2ws, height, width, device=device)[0]
242
+ .permute(0, 3, 1, 2)
243
+ .contiguous()
244
+ ) # V, 6, H, W
245
+ plucker_embedding = plucker_embedding[None]
246
+ plucker_embedding = rearrange(plucker_embedding, "b f c h w -> b f h w c")[0]
247
+ return plucker_embedding
248
+
249
+
250
+ def generate_camera_coordinates(
251
+ direction: Literal[
252
+ "Left", "Right", "Up", "Down", "LeftUp", "LeftDown", "RightUp", "RightDown"
253
+ ],
254
+ length: int,
255
+ speed: float = 1 / 54,
256
+ origin=(
257
+ 0,
258
+ 0.532139961,
259
+ 0.946026558,
260
+ 0.5,
261
+ 0.5,
262
+ 0,
263
+ 0,
264
+ 1,
265
+ 0,
266
+ 0,
267
+ 0,
268
+ 0,
269
+ 1,
270
+ 0,
271
+ 0,
272
+ 0,
273
+ 0,
274
+ 1,
275
+ 0,
276
+ ),
277
+ ):
278
+ coordinates = [list(origin)]
279
+ while len(coordinates) < length:
280
+ coor = coordinates[-1].copy()
281
+ if "Left" in direction:
282
+ coor[9] += speed
283
+ if "Right" in direction:
284
+ coor[9] -= speed
285
+ if "Up" in direction:
286
+ coor[13] += speed
287
+ if "Down" in direction:
288
+ coor[13] -= speed
289
+ coordinates.append(coor)
290
+ return coordinates
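A minimal sketch of how the pieces above fit together to build a Plücker camera embedding (the import path is an assumption based on this repo layout; the resolution values simply reuse the defaults of process_pose_file):

    from models.wan_video_camera_controller import (
        generate_camera_coordinates,
        process_pose_file,
    )

    # 81 camera entries for a leftward pan; each entry packs
    # [_, fx, fy, cx, cy, _, _, 3x4 world-to-camera matrix (row-major)].
    coords = generate_camera_coordinates(direction="Left", length=81, speed=1 / 54)

    # Per-frame Plücker ray maps at the target resolution.
    plucker = process_pose_file(coords, width=672, height=384)
    print(plucker.shape)  # torch.Size([81, 384, 672, 6])

The resulting per-frame 6-channel ray maps are what SimpleAdapter consumes as its control signal.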
models/wan_video_dit.py ADDED
@@ -0,0 +1,952 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ import math
5
+ from typing import Tuple, Optional
6
+ from einops import rearrange
7
+ from .utils import hash_state_dict_keys
8
+ from .wan_video_camera_controller import SimpleAdapter
9
+
10
+ try:
11
+ import flash_attn_interface
12
+
13
+ FLASH_ATTN_3_AVAILABLE = True
14
+ except ModuleNotFoundError:
15
+ FLASH_ATTN_3_AVAILABLE = False
16
+
17
+ try:
18
+ import flash_attn
19
+
20
+ FLASH_ATTN_2_AVAILABLE = True
21
+ except ModuleNotFoundError:
22
+ FLASH_ATTN_2_AVAILABLE = False
23
+
24
+ try:
25
+ from sageattention import sageattn
26
+
27
+ SAGE_ATTN_AVAILABLE = True
28
+ except ModuleNotFoundError:
29
+ SAGE_ATTN_AVAILABLE = False
30
+
31
+
32
+ def flash_attention(
33
+ q: torch.Tensor,
34
+ k: torch.Tensor,
35
+ v: torch.Tensor,
36
+ num_heads: int,
37
+ compatibility_mode=False,
38
+ ):
39
+ if compatibility_mode:
40
+ q = rearrange(q, "b s (n d) -> b n s d", n=num_heads)
41
+ k = rearrange(k, "b s (n d) -> b n s d", n=num_heads)
42
+ v = rearrange(v, "b s (n d) -> b n s d", n=num_heads)
43
+ x = F.scaled_dot_product_attention(q, k, v)
44
+ x = rearrange(x, "b n s d -> b s (n d)", n=num_heads)
45
+ elif FLASH_ATTN_3_AVAILABLE:
46
+ q = rearrange(q, "b s (n d) -> b s n d", n=num_heads)
47
+ k = rearrange(k, "b s (n d) -> b s n d", n=num_heads)
48
+ v = rearrange(v, "b s (n d) -> b s n d", n=num_heads)
49
+ x = flash_attn_interface.flash_attn_func(q, k, v)
50
+ if isinstance(x, tuple):
51
+ x = x[0]
52
+ x = rearrange(x, "b s n d -> b s (n d)", n=num_heads)
53
+ elif FLASH_ATTN_2_AVAILABLE:
54
+ q = rearrange(q, "b s (n d) -> b s n d", n=num_heads)
55
+ k = rearrange(k, "b s (n d) -> b s n d", n=num_heads)
56
+ v = rearrange(v, "b s (n d) -> b s n d", n=num_heads)
57
+ x = flash_attn.flash_attn_func(q, k, v)
58
+ x = rearrange(x, "b s n d -> b s (n d)", n=num_heads)
59
+ elif SAGE_ATTN_AVAILABLE:
60
+ q = rearrange(q, "b s (n d) -> b n s d", n=num_heads)
61
+ k = rearrange(k, "b s (n d) -> b n s d", n=num_heads)
62
+ v = rearrange(v, "b s (n d) -> b n s d", n=num_heads)
63
+ x = sageattn(q, k, v, tensor_layout="HND", is_causal=False)
64
+ x = rearrange(x, "b n s d -> b s (n d)", n=num_heads)
65
+ else:
66
+ q = rearrange(q, "b s (n d) -> b n s d", n=num_heads)
67
+ k = rearrange(k, "b s (n d) -> b n s d", n=num_heads)
68
+ v = rearrange(v, "b s (n d) -> b n s d", n=num_heads)
69
+ x = F.scaled_dot_product_attention(q, k, v)
70
+ x = rearrange(x, "b n s d -> b s (n d)", n=num_heads)
71
+ return x
72
+
73
+
74
+ def modulate(x: torch.Tensor, shift: torch.Tensor, scale: torch.Tensor):
75
+ return x * (1 + scale) + shift
76
+
77
+
78
+ def sinusoidal_embedding_1d(dim, position):
79
+ sinusoid = torch.outer(
80
+ position.type(torch.float64),
81
+ torch.pow(
82
+ 10000,
83
+ -torch.arange(dim // 2, dtype=torch.float64, device=position.device).div(
84
+ dim // 2
85
+ ),
86
+ ),
87
+ )
88
+ x = torch.cat([torch.cos(sinusoid), torch.sin(sinusoid)], dim=1)
89
+ return x.to(position.dtype)
90
+
91
+
92
+ def precompute_freqs_cis_3d(dim: int, end: int = 1024, theta: float = 10000.0):
93
+ # 3d rope precompute
94
+ f_freqs_cis = precompute_freqs_cis(dim - 2 * (dim // 3), end + 1, theta)
95
+ h_freqs_cis = precompute_freqs_cis(dim // 3, end, theta)
96
+ w_freqs_cis = precompute_freqs_cis(dim // 3, end, theta)
97
+ return f_freqs_cis, h_freqs_cis, w_freqs_cis
98
+
99
+
100
+ def precompute_freqs_cis(dim: int, end: int = 1024, theta: float = 10000.0):
101
+ # 1d rope precompute
102
+ freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].double() / dim))
103
+ ###################################################### add f = -1
104
+ positions = torch.arange(-1, end, device=freqs.device)
105
+ freqs = torch.outer(positions, freqs)
106
+ freqs_cis = torch.polar(torch.ones_like(freqs), freqs) # complex64
107
+ ######################################################
108
+ return freqs_cis
109
+
110
+
111
+ def rope_apply(x, freqs, num_heads):
112
+ x = rearrange(x, "b s (n d) -> b s n d", n=num_heads)
113
+ x_out = torch.view_as_complex(
114
+ x.to(torch.float64).reshape(x.shape[0], x.shape[1], x.shape[2], -1, 2)
115
+ )
116
+ x_out = torch.view_as_real(x_out * freqs).flatten(2)
117
+ return x_out.to(x.dtype)
118
+
119
+
120
+ class RMSNorm(nn.Module):
121
+ def __init__(self, dim, eps=1e-5):
122
+ super().__init__()
123
+ self.eps = eps
124
+ self.weight = nn.Parameter(torch.ones(dim))
125
+
126
+ def norm(self, x):
127
+ return x * torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)
128
+
129
+ def forward(self, x):
130
+ dtype = x.dtype
131
+ return self.norm(x.float()).to(dtype) * self.weight
132
+
133
+
134
+ class AttentionModule(nn.Module):
135
+ def __init__(self, num_heads):
136
+ super().__init__()
137
+ self.num_heads = num_heads
138
+
139
+ def forward(self, q, k, v):
140
+ x = flash_attention(q=q, k=k, v=v, num_heads=self.num_heads)
141
+ return x
142
+
143
+
144
+ class LoRALinearLayer(nn.Module):
145
+ def __init__(
146
+ self,
147
+ in_features: int,
148
+ out_features: int,
149
+ rank: int = 128,
150
+ device="cuda",
151
+ dtype: Optional[torch.dtype] = torch.float32,
152
+ ):
153
+ super().__init__()
154
+ self.down = nn.Linear(in_features, rank, bias=False, device=device, dtype=dtype)
155
+ self.up = nn.Linear(rank, out_features, bias=False, device=device, dtype=dtype)
156
+ self.rank = rank
157
+ self.out_features = out_features
158
+ self.in_features = in_features
159
+
160
+ nn.init.normal_(self.down.weight, std=1 / rank)
161
+ nn.init.zeros_(self.up.weight)
162
+
163
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
164
+ orig_dtype = hidden_states.dtype
165
+ dtype = self.down.weight.dtype
166
+
167
+ down_hidden_states = self.down(hidden_states.to(dtype))
168
+ up_hidden_states = self.up(down_hidden_states)
169
+ return up_hidden_states.to(orig_dtype)
170
+
171
+
172
+ class SelfAttention(nn.Module):
173
+ def __init__(self, dim: int, num_heads: int, eps: float = 1e-6):
174
+ super().__init__()
175
+ self.dim = dim
176
+ self.num_heads = num_heads
177
+ self.head_dim = dim // num_heads
178
+
179
+ self.q = nn.Linear(dim, dim)
180
+ self.k = nn.Linear(dim, dim)
181
+ self.v = nn.Linear(dim, dim)
182
+ self.o = nn.Linear(dim, dim)
183
+ self.norm_q = RMSNorm(dim, eps=eps)
184
+ self.norm_k = RMSNorm(dim, eps=eps)
185
+
186
+ self.attn = AttentionModule(self.num_heads)
187
+
188
+ self.kv_cache = None
189
+ self.cond_size = None
190
+
191
+ def init_lora(self, train=False):
192
+ dim = self.dim
193
+ self.q_loras = LoRALinearLayer(dim, dim, rank=128)
194
+ self.k_loras = LoRALinearLayer(dim, dim, rank=128)
195
+ self.v_loras = LoRALinearLayer(dim, dim, rank=128)
196
+
197
+ requires_grad = train
198
+ for lora in [self.q_loras, self.k_loras, self.v_loras]:
199
+ for param in lora.parameters():
200
+ param.requires_grad = requires_grad
201
+
202
+ def forward(self, x, freqs):
203
+ if self.cond_size is not None:
204
+ if self.kv_cache is None:
205
+ x_main, x_ip = x[:, : -self.cond_size], x[:, -self.cond_size :]
206
+ split_point = freqs.shape[0] - self.cond_size
207
+ freqs_main = freqs[:split_point]
208
+ freqs_ip = freqs[split_point:]
209
+
210
+ q_main = self.norm_q(self.q(x_main))
211
+ k_main = self.norm_k(self.k(x_main))
212
+ v_main = self.v(x_main)
213
+
214
+ q_main = rope_apply(q_main, freqs_main, self.num_heads)
215
+ k_main = rope_apply(k_main, freqs_main, self.num_heads)
216
+
217
+ q_ip = self.norm_q(self.q(x_ip) + self.q_loras(x_ip))
218
+ k_ip = self.norm_k(self.k(x_ip) + self.k_loras(x_ip))
219
+ v_ip = self.v(x_ip) + self.v_loras(x_ip)
220
+
221
+ q_ip = rope_apply(q_ip, freqs_ip, self.num_heads)
222
+ k_ip = rope_apply(k_ip, freqs_ip, self.num_heads)
223
+ self.kv_cache = {"k_ip": k_ip.detach(), "v_ip": v_ip.detach()}
224
+ full_k = torch.concat([k_main, k_ip], dim=1)
225
+ full_v = torch.concat([v_main, v_ip], dim=1)
226
+ cond_out = self.attn(q_ip, k_ip, v_ip)
227
+ main_out = self.attn(q_main, full_k, full_v)
228
+ out = torch.concat([main_out, cond_out], dim=1)
229
+ return self.o(out)
230
+
231
+ else:
232
+ k_ip = self.kv_cache["k_ip"]
233
+ v_ip = self.kv_cache["v_ip"]
234
+ q_main = self.norm_q(self.q(x))
235
+ k_main = self.norm_k(self.k(x))
236
+ v_main = self.v(x)
237
+ q_main = rope_apply(q_main, freqs, self.num_heads)
238
+ k_main = rope_apply(k_main, freqs, self.num_heads)
239
+
240
+ full_k = torch.concat([k_main, k_ip], dim=1)
241
+ full_v = torch.concat([v_main, v_ip], dim=1)
242
+ x = self.attn(q_main, full_k, full_v)
243
+ return self.o(x)
244
+ else:
245
+ q = self.norm_q(self.q(x))
246
+ k = self.norm_k(self.k(x))
247
+ v = self.v(x)
248
+ q = rope_apply(q, freqs, self.num_heads)
249
+ k = rope_apply(k, freqs, self.num_heads)
250
+ x = self.attn(q, k, v)
251
+ return self.o(x)
252
+
253
+
254
+ class CrossAttention(nn.Module):
255
+ def __init__(
256
+ self, dim: int, num_heads: int, eps: float = 1e-6, has_image_input: bool = False
257
+ ):
258
+ super().__init__()
259
+ self.dim = dim
260
+ self.num_heads = num_heads
261
+ self.head_dim = dim // num_heads
262
+
263
+ self.q = nn.Linear(dim, dim)
264
+ self.k = nn.Linear(dim, dim)
265
+ self.v = nn.Linear(dim, dim)
266
+ self.o = nn.Linear(dim, dim)
267
+ self.norm_q = RMSNorm(dim, eps=eps)
268
+ self.norm_k = RMSNorm(dim, eps=eps)
269
+ self.has_image_input = has_image_input
270
+ if has_image_input:
271
+ self.k_img = nn.Linear(dim, dim)
272
+ self.v_img = nn.Linear(dim, dim)
273
+ self.norm_k_img = RMSNorm(dim, eps=eps)
274
+
275
+ self.attn = AttentionModule(self.num_heads)
276
+
277
+ def forward(self, x: torch.Tensor, y: torch.Tensor):
278
+ if self.has_image_input:
279
+ img = y[:, :257]
280
+ ctx = y[:, 257:]
281
+ else:
282
+ ctx = y
283
+ q = self.norm_q(self.q(x))
284
+ k = self.norm_k(self.k(ctx))
285
+ v = self.v(ctx)
286
+ x = self.attn(q, k, v)
287
+ if self.has_image_input:
288
+ k_img = self.norm_k_img(self.k_img(img))
289
+ v_img = self.v_img(img)
290
+ y = flash_attention(q, k_img, v_img, num_heads=self.num_heads)
291
+ x = x + y
292
+ return self.o(x)
293
+
294
+
295
+ class GateModule(nn.Module):
296
+ def __init__(
297
+ self,
298
+ ):
299
+ super().__init__()
300
+
301
+ def forward(self, x, gate, residual):
302
+ return x + gate * residual
303
+
304
+
305
+ class DiTBlock(nn.Module):
306
+ def __init__(
307
+ self,
308
+ has_image_input: bool,
309
+ dim: int,
310
+ num_heads: int,
311
+ ffn_dim: int,
312
+ eps: float = 1e-6,
313
+ ):
314
+ super().__init__()
315
+ self.dim = dim
316
+ self.num_heads = num_heads
317
+ self.ffn_dim = ffn_dim
318
+
319
+ self.self_attn = SelfAttention(dim, num_heads, eps)
320
+ self.cross_attn = CrossAttention(
321
+ dim, num_heads, eps, has_image_input=has_image_input
322
+ )
323
+ self.norm1 = nn.LayerNorm(dim, eps=eps, elementwise_affine=False)
324
+ self.norm2 = nn.LayerNorm(dim, eps=eps, elementwise_affine=False)
325
+ self.norm3 = nn.LayerNorm(dim, eps=eps)
326
+ self.ffn = nn.Sequential(
327
+ nn.Linear(dim, ffn_dim),
328
+ nn.GELU(approximate="tanh"),
329
+ nn.Linear(ffn_dim, dim),
330
+ )
331
+ self.modulation = nn.Parameter(torch.randn(1, 6, dim) / dim**0.5)
332
+ self.gate = GateModule()
333
+
334
+ def forward(self, x, context, t_mod, freqs, x_ip=None, t_mod_ip=None):
335
+ # msa: multi-head self-attention mlp: multi-layer perceptron
336
+ shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
337
+ self.modulation.to(dtype=t_mod.dtype, device=t_mod.device) + t_mod
338
+ ).chunk(6, dim=1)
339
+
340
+ input_x = modulate(self.norm1(x), shift_msa, scale_msa)
341
+
342
+ if x_ip is not None:
343
+ (
344
+ shift_msa_ip,
345
+ scale_msa_ip,
346
+ gate_msa_ip,
347
+ shift_mlp_ip,
348
+ scale_mlp_ip,
349
+ gate_mlp_ip,
350
+ ) = (
351
+ self.modulation.to(dtype=t_mod_ip.dtype, device=t_mod_ip.device)
352
+ + t_mod_ip
353
+ ).chunk(6, dim=1)
354
+ input_x_ip = modulate(
355
+ self.norm1(x_ip), shift_msa_ip, scale_msa_ip
356
+ ) # [1, 1024, 5120]
357
+ self.self_attn.cond_size = input_x_ip.shape[1]
358
+ input_x = torch.concat([input_x, input_x_ip], dim=1)
359
+ self.self_attn.kv_cache = None
360
+
361
+ attn_out = self.self_attn(input_x, freqs)
362
+ if x_ip is not None:
363
+ attn_out, attn_out_ip = (
364
+ attn_out[:, : -self.self_attn.cond_size],
365
+ attn_out[:, -self.self_attn.cond_size :],
366
+ )
367
+
368
+ x = self.gate(x, gate_msa, attn_out)
369
+ x = x + self.cross_attn(self.norm3(x), context)
370
+ input_x = modulate(self.norm2(x), shift_mlp, scale_mlp)
371
+ x = self.gate(x, gate_mlp, self.ffn(input_x))
372
+
373
+ if x_ip is not None:
374
+ x_ip = self.gate(x_ip, gate_msa_ip, attn_out_ip)
375
+ input_x_ip = modulate(self.norm2(x_ip), shift_mlp_ip, scale_mlp_ip)
376
+ x_ip = self.gate(x_ip, gate_mlp_ip, self.ffn(input_x_ip))
377
+ return x, x_ip
378
+
379
+
380
+ class MLP(torch.nn.Module):
381
+ def __init__(self, in_dim, out_dim, has_pos_emb=False):
382
+ super().__init__()
383
+ self.proj = torch.nn.Sequential(
384
+ nn.LayerNorm(in_dim),
385
+ nn.Linear(in_dim, in_dim),
386
+ nn.GELU(),
387
+ nn.Linear(in_dim, out_dim),
388
+ nn.LayerNorm(out_dim),
389
+ )
390
+ self.has_pos_emb = has_pos_emb
391
+ if has_pos_emb:
392
+ self.emb_pos = torch.nn.Parameter(torch.zeros((1, 514, 1280)))
393
+
394
+ def forward(self, x):
395
+ if self.has_pos_emb:
396
+ x = x + self.emb_pos.to(dtype=x.dtype, device=x.device)
397
+ return self.proj(x)
398
+
399
+
400
+ class Head(nn.Module):
401
+ def __init__(
402
+ self, dim: int, out_dim: int, patch_size: Tuple[int, int, int], eps: float
403
+ ):
404
+ super().__init__()
405
+ self.dim = dim
406
+ self.patch_size = patch_size
407
+ self.norm = nn.LayerNorm(dim, eps=eps, elementwise_affine=False)
408
+ self.head = nn.Linear(dim, out_dim * math.prod(patch_size))
409
+ self.modulation = nn.Parameter(torch.randn(1, 2, dim) / dim**0.5)
410
+
411
+ def forward(self, x, t_mod):
412
+ if len(t_mod.shape) == 3:
413
+ shift, scale = (
414
+ self.modulation.unsqueeze(0).to(dtype=t_mod.dtype, device=t_mod.device)
415
+ + t_mod.unsqueeze(2)
416
+ ).chunk(2, dim=2)
417
+ x = self.head(self.norm(x) * (1 + scale.squeeze(2)) + shift.squeeze(2))
418
+ else:
419
+ shift, scale = (
420
+ self.modulation.to(dtype=t_mod.dtype, device=t_mod.device) + t_mod
421
+ ).chunk(2, dim=1)
422
+ x = self.head(self.norm(x) * (1 + scale) + shift)
423
+ return x
424
+
425
+
426
+ class WanModel(torch.nn.Module):
427
+ def __init__(
428
+ self,
429
+ dim: int,
430
+ in_dim: int,
431
+ ffn_dim: int,
432
+ out_dim: int,
433
+ text_dim: int,
434
+ freq_dim: int,
435
+ eps: float,
436
+ patch_size: Tuple[int, int, int],
437
+ num_heads: int,
438
+ num_layers: int,
439
+ has_image_input: bool,
440
+ has_image_pos_emb: bool = False,
441
+ has_ref_conv: bool = False,
442
+ add_control_adapter: bool = False,
443
+ in_dim_control_adapter: int = 24,
444
+ seperated_timestep: bool = False,
445
+ require_vae_embedding: bool = True,
446
+ require_clip_embedding: bool = True,
447
+ fuse_vae_embedding_in_latents: bool = False,
448
+ ):
449
+ super().__init__()
450
+ self.dim = dim
451
+ self.freq_dim = freq_dim
452
+ self.has_image_input = has_image_input
453
+ self.patch_size = patch_size
454
+ self.seperated_timestep = seperated_timestep
455
+ self.require_vae_embedding = require_vae_embedding
456
+ self.require_clip_embedding = require_clip_embedding
457
+ self.fuse_vae_embedding_in_latents = fuse_vae_embedding_in_latents
458
+
459
+ self.patch_embedding = nn.Conv3d(
460
+ in_dim, dim, kernel_size=patch_size, stride=patch_size
461
+ )
462
+ self.text_embedding = nn.Sequential(
463
+ nn.Linear(text_dim, dim), nn.GELU(approximate="tanh"), nn.Linear(dim, dim)
464
+ )
465
+ self.time_embedding = nn.Sequential(
466
+ nn.Linear(freq_dim, dim), nn.SiLU(), nn.Linear(dim, dim)
467
+ )
468
+ self.time_projection = nn.Sequential(nn.SiLU(), nn.Linear(dim, dim * 6))
469
+ self.blocks = nn.ModuleList(
470
+ [
471
+ DiTBlock(has_image_input, dim, num_heads, ffn_dim, eps)
472
+ for _ in range(num_layers)
473
+ ]
474
+ )
475
+ self.head = Head(dim, out_dim, patch_size, eps)
476
+ head_dim = dim // num_heads
477
+ self.freqs = precompute_freqs_cis_3d(head_dim)
478
+
479
+ if has_image_input:
480
+ self.img_emb = MLP(
481
+ 1280, dim, has_pos_emb=has_image_pos_emb
482
+ ) # clip_feature_dim = 1280
483
+ if has_ref_conv:
484
+ self.ref_conv = nn.Conv2d(16, dim, kernel_size=(2, 2), stride=(2, 2))
485
+ self.has_image_pos_emb = has_image_pos_emb
486
+ self.has_ref_conv = has_ref_conv
487
+ if add_control_adapter:
488
+ self.control_adapter = SimpleAdapter(
489
+ in_dim_control_adapter,
490
+ dim,
491
+ kernel_size=patch_size[1:],
492
+ stride=patch_size[1:],
493
+ )
494
+ else:
495
+ self.control_adapter = None
496
+
497
+ def patchify(
498
+ self, x: torch.Tensor, control_camera_latents_input: torch.Tensor = None
499
+ ):
500
+ x = self.patch_embedding(x)
501
+ if (
502
+ self.control_adapter is not None
503
+ and control_camera_latents_input is not None
504
+ ):
505
+ y_camera = self.control_adapter(control_camera_latents_input)
506
+ x = [u + v for u, v in zip(x, y_camera)]
507
+ x = x[0].unsqueeze(0)
508
+ grid_size = x.shape[2:]
509
+ x = rearrange(x, "b c f h w -> b (f h w) c").contiguous()
510
+ return x, grid_size # x, grid_size: (f, h, w)
511
+
512
+ def unpatchify(self, x: torch.Tensor, grid_size: torch.Tensor):
513
+ return rearrange(
514
+ x,
515
+ "b (f h w) (x y z c) -> b c (f x) (h y) (w z)",
516
+ f=grid_size[0],
517
+ h=grid_size[1],
518
+ w=grid_size[2],
519
+ x=self.patch_size[0],
520
+ y=self.patch_size[1],
521
+ z=self.patch_size[2],
522
+ )
523
+
524
+ def forward(
525
+ self,
526
+ x: torch.Tensor,
527
+ timestep: torch.Tensor,
528
+ context: torch.Tensor,
529
+ clip_feature: Optional[torch.Tensor] = None,
530
+ y: Optional[torch.Tensor] = None,
531
+ use_gradient_checkpointing: bool = False,
532
+ use_gradient_checkpointing_offload: bool = False,
533
+ ip_image=None,
534
+ **kwargs,
535
+ ):
536
+ x_ip = None
537
+ t_mod_ip = None
538
+ t = self.time_embedding(sinusoidal_embedding_1d(self.freq_dim, timestep))
539
+ t_mod = self.time_projection(t).unflatten(1, (6, self.dim))
540
+ context = self.text_embedding(context)
541
+
542
+ if ip_image is not None:
543
+ timestep_ip = torch.zeros_like(timestep) # [B] with 0s
544
+ t_ip = self.time_embedding(
545
+ sinusoidal_embedding_1d(self.freq_dim, timestep_ip)
546
+ )
547
+ t_mod_ip = self.time_projection(t_ip).unflatten(1, (6, self.dim))
548
+ x, (f, h, w) = self.patchify(x)
549
+
550
+ offset = 1
551
+ freqs = (
552
+ torch.cat(
553
+ [
554
+ self.freqs[0][offset : f + offset]
555
+ .view(f, 1, 1, -1)
556
+ .expand(f, h, w, -1),
557
+ self.freqs[1][offset : h + offset]
558
+ .view(1, h, 1, -1)
559
+ .expand(f, h, w, -1),
560
+ self.freqs[2][offset : w + offset]
561
+ .view(1, 1, w, -1)
562
+ .expand(f, h, w, -1),
563
+ ],
564
+ dim=-1,
565
+ )
566
+ .reshape(f * h * w, 1, -1)
567
+ .to(x.device)
568
+ )
569
+
570
+ ############################################################################################
571
+ if ip_image is not None:
572
+ if ip_image.dim() == 6 and ip_image.shape[3] == 1:
573
+ ip_image = ip_image.squeeze(1)
574
+ x_ip, (f_ip, h_ip, w_ip) = self.patchify(
575
+ ip_image
576
+ ) # x_ip [1, 1024, 5120] [B, N, D] f_ip = 1 h_ip = 32 w_ip = 32
577
+ freqs_ip = (
578
+ torch.cat(
579
+ [
580
+ self.freqs[0][0]
581
+ .view(f_ip, 1, 1, -1)
582
+ .expand(f_ip, h_ip, w_ip, -1),
583
+ self.freqs[1][h + offset : h + offset + h_ip]
584
+ .view(1, h_ip, 1, -1)
585
+ .expand(f_ip, h_ip, w_ip, -1),
586
+ self.freqs[2][w + offset : w + offset + w_ip]
587
+ .view(1, 1, w_ip, -1)
588
+ .expand(f_ip, h_ip, w_ip, -1),
589
+ ],
590
+ dim=-1,
591
+ )
592
+ .reshape(f_ip * h_ip * w_ip, 1, -1)
593
+ .to(x_ip.device)
594
+ )
595
+ freqs = torch.cat([freqs, freqs_ip], dim=0)
596
+
597
+ ############################################################################################
598
+ def create_custom_forward(module):
599
+ def custom_forward(*inputs):
600
+ return module(*inputs)
601
+
602
+ return custom_forward
603
+
604
+ for block in self.blocks:
605
+ if self.training and use_gradient_checkpointing:
606
+ if use_gradient_checkpointing_offload:
607
+ with torch.autograd.graph.save_on_cpu():
608
+ x, x_ip = torch.utils.checkpoint.checkpoint(
609
+ create_custom_forward(block),
610
+ x,
611
+ context,
612
+ t_mod,
613
+ freqs,
614
+ x_ip,
615
+ t_mod_ip,
616
+ use_reentrant=False,
617
+ )
618
+ else:
619
+ x, x_ip = torch.utils.checkpoint.checkpoint(
620
+ create_custom_forward(block),
621
+ x,
622
+ context,
623
+ t_mod,
624
+ freqs,
625
+ x_ip,
626
+ t_mod_ip,
627
+ use_reentrant=False,
628
+ )
629
+ else:
630
+ x, x_ip = block(x, context, t_mod, freqs, x_ip, t_mod_ip)
631
+
632
+ x = self.head(x, t)
633
+ x = self.unpatchify(x, (f, h, w))
634
+ return x
635
+
636
+ @staticmethod
637
+ def state_dict_converter():
638
+ return WanModelStateDictConverter()
639
+
640
+
641
+ class WanModelStateDictConverter:
642
+ def __init__(self):
643
+ pass
644
+
645
+ def from_diffusers(self, state_dict):
646
+ rename_dict = {
647
+ "blocks.0.attn1.norm_k.weight": "blocks.0.self_attn.norm_k.weight",
648
+ "blocks.0.attn1.norm_q.weight": "blocks.0.self_attn.norm_q.weight",
649
+ "blocks.0.attn1.to_k.bias": "blocks.0.self_attn.k.bias",
650
+ "blocks.0.attn1.to_k.weight": "blocks.0.self_attn.k.weight",
651
+ "blocks.0.attn1.to_out.0.bias": "blocks.0.self_attn.o.bias",
652
+ "blocks.0.attn1.to_out.0.weight": "blocks.0.self_attn.o.weight",
653
+ "blocks.0.attn1.to_q.bias": "blocks.0.self_attn.q.bias",
654
+ "blocks.0.attn1.to_q.weight": "blocks.0.self_attn.q.weight",
655
+ "blocks.0.attn1.to_v.bias": "blocks.0.self_attn.v.bias",
656
+ "blocks.0.attn1.to_v.weight": "blocks.0.self_attn.v.weight",
657
+ "blocks.0.attn2.norm_k.weight": "blocks.0.cross_attn.norm_k.weight",
658
+ "blocks.0.attn2.norm_q.weight": "blocks.0.cross_attn.norm_q.weight",
659
+ "blocks.0.attn2.to_k.bias": "blocks.0.cross_attn.k.bias",
660
+ "blocks.0.attn2.to_k.weight": "blocks.0.cross_attn.k.weight",
661
+ "blocks.0.attn2.to_out.0.bias": "blocks.0.cross_attn.o.bias",
662
+ "blocks.0.attn2.to_out.0.weight": "blocks.0.cross_attn.o.weight",
663
+ "blocks.0.attn2.to_q.bias": "blocks.0.cross_attn.q.bias",
664
+ "blocks.0.attn2.to_q.weight": "blocks.0.cross_attn.q.weight",
665
+ "blocks.0.attn2.to_v.bias": "blocks.0.cross_attn.v.bias",
666
+ "blocks.0.attn2.to_v.weight": "blocks.0.cross_attn.v.weight",
667
+ "blocks.0.ffn.net.0.proj.bias": "blocks.0.ffn.0.bias",
668
+ "blocks.0.ffn.net.0.proj.weight": "blocks.0.ffn.0.weight",
669
+ "blocks.0.ffn.net.2.bias": "blocks.0.ffn.2.bias",
670
+ "blocks.0.ffn.net.2.weight": "blocks.0.ffn.2.weight",
671
+ "blocks.0.norm2.bias": "blocks.0.norm3.bias",
672
+ "blocks.0.norm2.weight": "blocks.0.norm3.weight",
673
+ "blocks.0.scale_shift_table": "blocks.0.modulation",
674
+ "condition_embedder.text_embedder.linear_1.bias": "text_embedding.0.bias",
675
+ "condition_embedder.text_embedder.linear_1.weight": "text_embedding.0.weight",
676
+ "condition_embedder.text_embedder.linear_2.bias": "text_embedding.2.bias",
677
+ "condition_embedder.text_embedder.linear_2.weight": "text_embedding.2.weight",
678
+ "condition_embedder.time_embedder.linear_1.bias": "time_embedding.0.bias",
679
+ "condition_embedder.time_embedder.linear_1.weight": "time_embedding.0.weight",
680
+ "condition_embedder.time_embedder.linear_2.bias": "time_embedding.2.bias",
681
+ "condition_embedder.time_embedder.linear_2.weight": "time_embedding.2.weight",
682
+ "condition_embedder.time_proj.bias": "time_projection.1.bias",
683
+ "condition_embedder.time_proj.weight": "time_projection.1.weight",
684
+ "patch_embedding.bias": "patch_embedding.bias",
685
+ "patch_embedding.weight": "patch_embedding.weight",
686
+ "scale_shift_table": "head.modulation",
687
+ "proj_out.bias": "head.head.bias",
688
+ "proj_out.weight": "head.head.weight",
689
+ }
690
+ state_dict_ = {}
691
+ for name, param in state_dict.items():
692
+ if name in rename_dict:
693
+ state_dict_[rename_dict[name]] = param
694
+ else:
695
+ name_ = ".".join(name.split(".")[:1] + ["0"] + name.split(".")[2:])
696
+ if name_ in rename_dict:
697
+ name_ = rename_dict[name_]
698
+ name_ = ".".join(
699
+ name_.split(".")[:1]
700
+ + [name.split(".")[1]]
701
+ + name_.split(".")[2:]
702
+ )
703
+ state_dict_[name_] = param
704
+ if hash_state_dict_keys(state_dict) == "cb104773c6c2cb6df4f9529ad5c60d0b":
705
+ config = {
706
+ "model_type": "t2v",
707
+ "patch_size": (1, 2, 2),
708
+ "text_len": 512,
709
+ "in_dim": 16,
710
+ "dim": 5120,
711
+ "ffn_dim": 13824,
712
+ "freq_dim": 256,
713
+ "text_dim": 4096,
714
+ "out_dim": 16,
715
+ "num_heads": 40,
716
+ "num_layers": 40,
717
+ "window_size": (-1, -1),
718
+ "qk_norm": True,
719
+ "cross_attn_norm": True,
720
+ "eps": 1e-6,
721
+ }
722
+ else:
723
+ config = {}
724
+ return state_dict_, config
725
+
726
+ def from_civitai(self, state_dict):
727
+ state_dict = {
728
+ name: param
729
+ for name, param in state_dict.items()
730
+ if not name.startswith("vace")
731
+ }
732
+ if hash_state_dict_keys(state_dict) == "9269f8db9040a9d860eaca435be61814":
733
+ config = {
734
+ "has_image_input": False,
735
+ "patch_size": [1, 2, 2],
736
+ "in_dim": 16,
737
+ "dim": 1536,
738
+ "ffn_dim": 8960,
739
+ "freq_dim": 256,
740
+ "text_dim": 4096,
741
+ "out_dim": 16,
742
+ "num_heads": 12,
743
+ "num_layers": 30,
744
+ "eps": 1e-6,
745
+ }
746
+ elif hash_state_dict_keys(state_dict) == "aafcfd9672c3a2456dc46e1cb6e52c70":
747
+ config = {
748
+ "has_image_input": False,
749
+ "patch_size": [1, 2, 2],
750
+ "in_dim": 16,
751
+ "dim": 5120,
752
+ "ffn_dim": 13824,
753
+ "freq_dim": 256,
754
+ "text_dim": 4096,
755
+ "out_dim": 16,
756
+ "num_heads": 40,
757
+ "num_layers": 40,
758
+ "eps": 1e-6,
759
+ }
760
+ elif hash_state_dict_keys(state_dict) == "6bfcfb3b342cb286ce886889d519a77e":
761
+ config = {
762
+ "has_image_input": True,
763
+ "patch_size": [1, 2, 2],
764
+ "in_dim": 36,
765
+ "dim": 5120,
766
+ "ffn_dim": 13824,
767
+ "freq_dim": 256,
768
+ "text_dim": 4096,
769
+ "out_dim": 16,
770
+ "num_heads": 40,
771
+ "num_layers": 40,
772
+ "eps": 1e-6,
773
+ }
774
+ elif hash_state_dict_keys(state_dict) == "6d6ccde6845b95ad9114ab993d917893":
775
+ config = {
776
+ "has_image_input": True,
777
+ "patch_size": [1, 2, 2],
778
+ "in_dim": 36,
779
+ "dim": 1536,
780
+ "ffn_dim": 8960,
781
+ "freq_dim": 256,
782
+ "text_dim": 4096,
783
+ "out_dim": 16,
784
+ "num_heads": 12,
785
+ "num_layers": 30,
786
+ "eps": 1e-6,
787
+ }
788
+ elif hash_state_dict_keys(state_dict) == "349723183fc063b2bfc10bb2835cf677":
803
+ # 1.3B PAI control
804
+ config = {
805
+ "has_image_input": True,
806
+ "patch_size": [1, 2, 2],
807
+ "in_dim": 48,
808
+ "dim": 1536,
809
+ "ffn_dim": 8960,
810
+ "freq_dim": 256,
811
+ "text_dim": 4096,
812
+ "out_dim": 16,
813
+ "num_heads": 12,
814
+ "num_layers": 30,
815
+ "eps": 1e-6,
816
+ }
817
+ elif hash_state_dict_keys(state_dict) == "efa44cddf936c70abd0ea28b6cbe946c":
818
+ # 14B PAI control
819
+ config = {
820
+ "has_image_input": True,
821
+ "patch_size": [1, 2, 2],
822
+ "in_dim": 48,
823
+ "dim": 5120,
824
+ "ffn_dim": 13824,
825
+ "freq_dim": 256,
826
+ "text_dim": 4096,
827
+ "out_dim": 16,
828
+ "num_heads": 40,
829
+ "num_layers": 40,
830
+ "eps": 1e-6,
831
+ }
832
+ elif hash_state_dict_keys(state_dict) == "3ef3b1f8e1dab83d5b71fd7b617f859f":
833
+ config = {
834
+ "has_image_input": True,
835
+ "patch_size": [1, 2, 2],
836
+ "in_dim": 36,
837
+ "dim": 5120,
838
+ "ffn_dim": 13824,
839
+ "freq_dim": 256,
840
+ "text_dim": 4096,
841
+ "out_dim": 16,
842
+ "num_heads": 40,
843
+ "num_layers": 40,
844
+ "eps": 1e-6,
845
+ "has_image_pos_emb": True,
846
+ }
847
+ elif hash_state_dict_keys(state_dict) == "70ddad9d3a133785da5ea371aae09504":
848
+ # 1.3B PAI control v1.1
849
+ config = {
850
+ "has_image_input": True,
851
+ "patch_size": [1, 2, 2],
852
+ "in_dim": 48,
853
+ "dim": 1536,
854
+ "ffn_dim": 8960,
855
+ "freq_dim": 256,
856
+ "text_dim": 4096,
857
+ "out_dim": 16,
858
+ "num_heads": 12,
859
+ "num_layers": 30,
860
+ "eps": 1e-6,
861
+ "has_ref_conv": True,
862
+ }
863
+ elif hash_state_dict_keys(state_dict) == "26bde73488a92e64cc20b0a7485b9e5b":
864
+ # 14B PAI control v1.1
865
+ config = {
866
+ "has_image_input": True,
867
+ "patch_size": [1, 2, 2],
868
+ "in_dim": 48,
869
+ "dim": 5120,
870
+ "ffn_dim": 13824,
871
+ "freq_dim": 256,
872
+ "text_dim": 4096,
873
+ "out_dim": 16,
874
+ "num_heads": 40,
875
+ "num_layers": 40,
876
+ "eps": 1e-6,
877
+ "has_ref_conv": True,
878
+ }
879
+ elif hash_state_dict_keys(state_dict) == "ac6a5aa74f4a0aab6f64eb9a72f19901":
880
+ # 1.3B PAI control-camera v1.1
881
+ config = {
882
+ "has_image_input": True,
883
+ "patch_size": [1, 2, 2],
884
+ "in_dim": 32,
885
+ "dim": 1536,
886
+ "ffn_dim": 8960,
887
+ "freq_dim": 256,
888
+ "text_dim": 4096,
889
+ "out_dim": 16,
890
+ "num_heads": 12,
891
+ "num_layers": 30,
892
+ "eps": 1e-6,
893
+ "has_ref_conv": False,
894
+ "add_control_adapter": True,
895
+ "in_dim_control_adapter": 24,
896
+ }
897
+ elif hash_state_dict_keys(state_dict) == "b61c605c2adbd23124d152ed28e049ae":
898
+ # 14B PAI control-camera v1.1
899
+ config = {
900
+ "has_image_input": True,
901
+ "patch_size": [1, 2, 2],
902
+ "in_dim": 32,
903
+ "dim": 5120,
904
+ "ffn_dim": 13824,
905
+ "freq_dim": 256,
906
+ "text_dim": 4096,
907
+ "out_dim": 16,
908
+ "num_heads": 40,
909
+ "num_layers": 40,
910
+ "eps": 1e-6,
911
+ "has_ref_conv": False,
912
+ "add_control_adapter": True,
913
+ "in_dim_control_adapter": 24,
914
+ }
915
+ elif hash_state_dict_keys(state_dict) == "1f5ab7703c6fc803fdded85ff040c316":
916
+ # Wan-AI/Wan2.2-TI2V-5B
917
+ config = {
918
+ "has_image_input": False,
919
+ "patch_size": [1, 2, 2],
920
+ "in_dim": 48,
921
+ "dim": 3072,
922
+ "ffn_dim": 14336,
923
+ "freq_dim": 256,
924
+ "text_dim": 4096,
925
+ "out_dim": 48,
926
+ "num_heads": 24,
927
+ "num_layers": 30,
928
+ "eps": 1e-6,
929
+ "seperated_timestep": True,
930
+ "require_clip_embedding": False,
931
+ "require_vae_embedding": False,
932
+ "fuse_vae_embedding_in_latents": True,
933
+ }
934
+ elif hash_state_dict_keys(state_dict) == "5b013604280dd715f8457c6ed6d6a626":
935
+ # Wan-AI/Wan2.2-I2V-A14B
936
+ config = {
937
+ "has_image_input": False,
938
+ "patch_size": [1, 2, 2],
939
+ "in_dim": 36,
940
+ "dim": 5120,
941
+ "ffn_dim": 13824,
942
+ "freq_dim": 256,
943
+ "text_dim": 4096,
944
+ "out_dim": 16,
945
+ "num_heads": 40,
946
+ "num_layers": 40,
947
+ "eps": 1e-6,
948
+ "require_clip_embedding": False,
949
+ }
950
+ else:
951
+ config = {}
952
+ return state_dict, config
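Two small sanity-check sketches for the attention helper and the timestep embedding defined in this file (import path assumed from the repo layout; both run on CPU because compatibility_mode uses the plain PyTorch fallback):

    import torch
    from models.wan_video_dit import flash_attention, sinusoidal_embedding_1d

    # [batch, sequence, num_heads * head_dim]
    q = torch.randn(1, 16, 128)
    k = torch.randn(1, 16, 128)
    v = torch.randn(1, 16, 128)

    # compatibility_mode=True routes through torch.nn.functional.scaled_dot_product_attention,
    # the same path the function falls back to when no flash/sage kernels are installed.
    out = flash_attention(q, k, v, num_heads=8, compatibility_mode=True)
    print(out.shape)  # torch.Size([1, 16, 128])

    # Sinusoidal timestep embedding used by WanModel; the frequency dim must be even.
    t = sinusoidal_embedding_1d(256, torch.tensor([0.0, 500.0, 999.0]))
    print(t.shape)  # torch.Size([3, 256])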
models/wan_video_image_encoder.py ADDED
@@ -0,0 +1,957 @@
1
+ """
2
+ Concise re-implementation of
3
+ ``https://github.com/openai/CLIP'' and
4
+ ``https://github.com/mlfoundations/open_clip''.
5
+ """
6
+
7
+ import math
8
+ import torch
9
+ import torch.nn as nn
10
+ import torch.nn.functional as F
11
+ import torchvision.transforms as T
12
+ from .wan_video_dit import flash_attention
13
+
14
+
15
+ class SelfAttention(nn.Module):
16
+ def __init__(self, dim, num_heads, dropout=0.1, eps=1e-5):
17
+ assert dim % num_heads == 0
18
+ super().__init__()
19
+ self.dim = dim
20
+ self.num_heads = num_heads
21
+ self.head_dim = dim // num_heads
22
+ self.eps = eps
23
+
24
+ # layers
25
+ self.q = nn.Linear(dim, dim)
26
+ self.k = nn.Linear(dim, dim)
27
+ self.v = nn.Linear(dim, dim)
28
+ self.o = nn.Linear(dim, dim)
29
+ self.dropout = nn.Dropout(dropout)
30
+
31
+ def forward(self, x, mask):
32
+ """
33
+ x: [B, L, C].
34
+ """
35
+ b, s, c, n, d = *x.size(), self.num_heads, self.head_dim
36
+
37
+ # compute query, key, value
38
+ q = self.q(x).reshape(b, s, n, d).permute(0, 2, 1, 3)
39
+ k = self.k(x).reshape(b, s, n, d).permute(0, 2, 1, 3)
40
+ v = self.v(x).reshape(b, s, n, d).permute(0, 2, 1, 3)
41
+
42
+ # compute attention
43
+ p = self.dropout.p if self.training else 0.0
44
+ x = F.scaled_dot_product_attention(q, k, v, mask, p)
45
+ x = x.permute(0, 2, 1, 3).reshape(b, s, c)
46
+
47
+ # output
48
+ x = self.o(x)
49
+ x = self.dropout(x)
50
+ return x
51
+
52
+
53
+ class AttentionBlock(nn.Module):
54
+ def __init__(self, dim, num_heads, post_norm, dropout=0.1, eps=1e-5):
55
+ super().__init__()
56
+ self.dim = dim
57
+ self.num_heads = num_heads
58
+ self.post_norm = post_norm
59
+ self.eps = eps
60
+
61
+ # layers
62
+ self.attn = SelfAttention(dim, num_heads, dropout, eps)
63
+ self.norm1 = nn.LayerNorm(dim, eps=eps)
64
+ self.ffn = nn.Sequential(
65
+ nn.Linear(dim, dim * 4),
66
+ nn.GELU(),
67
+ nn.Linear(dim * 4, dim),
68
+ nn.Dropout(dropout),
69
+ )
70
+ self.norm2 = nn.LayerNorm(dim, eps=eps)
71
+
72
+ def forward(self, x, mask):
73
+ if self.post_norm:
74
+ x = self.norm1(x + self.attn(x, mask))
75
+ x = self.norm2(x + self.ffn(x))
76
+ else:
77
+ x = x + self.attn(self.norm1(x), mask)
78
+ x = x + self.ffn(self.norm2(x))
79
+ return x
80
+
81
+
82
+ class XLMRoberta(nn.Module):
83
+ """
84
+ XLMRobertaModel with no pooler and no LM head.
85
+ """
86
+
87
+ def __init__(
88
+ self,
89
+ vocab_size=250002,
90
+ max_seq_len=514,
91
+ type_size=1,
92
+ pad_id=1,
93
+ dim=1024,
94
+ num_heads=16,
95
+ num_layers=24,
96
+ post_norm=True,
97
+ dropout=0.1,
98
+ eps=1e-5,
99
+ ):
100
+ super().__init__()
101
+ self.vocab_size = vocab_size
102
+ self.max_seq_len = max_seq_len
103
+ self.type_size = type_size
104
+ self.pad_id = pad_id
105
+ self.dim = dim
106
+ self.num_heads = num_heads
107
+ self.num_layers = num_layers
108
+ self.post_norm = post_norm
109
+ self.eps = eps
110
+
111
+ # embeddings
112
+ self.token_embedding = nn.Embedding(vocab_size, dim, padding_idx=pad_id)
113
+ self.type_embedding = nn.Embedding(type_size, dim)
114
+ self.pos_embedding = nn.Embedding(max_seq_len, dim, padding_idx=pad_id)
115
+ self.dropout = nn.Dropout(dropout)
116
+
117
+ # blocks
118
+ self.blocks = nn.ModuleList(
119
+ [
120
+ AttentionBlock(dim, num_heads, post_norm, dropout, eps)
121
+ for _ in range(num_layers)
122
+ ]
123
+ )
124
+
125
+ # norm layer
126
+ self.norm = nn.LayerNorm(dim, eps=eps)
127
+
128
+ def forward(self, ids):
129
+ """
130
+ ids: [B, L] of torch.LongTensor.
131
+ """
132
+ b, s = ids.shape
133
+ mask = ids.ne(self.pad_id).long()
134
+
135
+ # embeddings
136
+ x = (
137
+ self.token_embedding(ids)
138
+ + self.type_embedding(torch.zeros_like(ids))
139
+ + self.pos_embedding(self.pad_id + torch.cumsum(mask, dim=1) * mask)
140
+ )
141
+ if self.post_norm:
142
+ x = self.norm(x)
143
+ x = self.dropout(x)
144
+
145
+ # blocks
146
+ mask = torch.where(mask.view(b, 1, 1, s).gt(0), 0.0, torch.finfo(x.dtype).min)
147
+ for block in self.blocks:
148
+ x = block(x, mask)
149
+
150
+ # output
151
+ if not self.post_norm:
152
+ x = self.norm(x)
153
+ return x
154
+
155
+
156
+ def xlm_roberta_large(pretrained=False, return_tokenizer=False, device="cpu", **kwargs):
157
+ """
158
+ XLMRobertaLarge adapted from Huggingface.
159
+ """
160
+ # params
161
+ cfg = dict(
162
+ vocab_size=250002,
163
+ max_seq_len=514,
164
+ type_size=1,
165
+ pad_id=1,
166
+ dim=1024,
167
+ num_heads=16,
168
+ num_layers=24,
169
+ post_norm=True,
170
+ dropout=0.1,
171
+ eps=1e-5,
172
+ )
173
+ cfg.update(**kwargs)
174
+
175
+ # init model
176
+ if pretrained:
177
+ from sora import DOWNLOAD_TO_CACHE
178
+
179
+ # init a meta model
180
+ with torch.device("meta"):
181
+ model = XLMRoberta(**cfg)
182
+
183
+ # load checkpoint
184
+ model.load_state_dict(
185
+ torch.load(
186
+ DOWNLOAD_TO_CACHE("models/xlm_roberta/xlm_roberta_large.pth"),
187
+ map_location=device,
188
+ ),
189
+ assign=True,
190
+ )
191
+ else:
192
+ # init a model on device
193
+ with torch.device(device):
194
+ model = XLMRoberta(**cfg)
195
+
196
+ # init tokenizer
197
+ if return_tokenizer:
198
+ from sora.data import HuggingfaceTokenizer
199
+
200
+ tokenizer = HuggingfaceTokenizer(
201
+ name="xlm-roberta-large", seq_len=model.text_len, clean="whitespace"
202
+ )
203
+ return model, tokenizer
204
+ else:
205
+ return model
206
+
207
+
208
+ def pos_interpolate(pos, seq_len):
209
+ if pos.size(1) == seq_len:
210
+ return pos
211
+ else:
212
+ src_grid = int(math.sqrt(pos.size(1)))
213
+ tar_grid = int(math.sqrt(seq_len))
214
+ n = pos.size(1) - src_grid * src_grid
215
+ return torch.cat(
216
+ [
217
+ pos[:, :n],
218
+ F.interpolate(
219
+ pos[:, n:]
220
+ .float()
221
+ .reshape(1, src_grid, src_grid, -1)
222
+ .permute(0, 3, 1, 2),
223
+ size=(tar_grid, tar_grid),
224
+ mode="bicubic",
225
+ align_corners=False,
226
+ )
227
+ .flatten(2)
228
+ .transpose(1, 2),
229
+ ],
230
+ dim=1,
231
+ )
232
+
233
+
234
+ class QuickGELU(nn.Module):
235
+ def forward(self, x):
236
+ return x * torch.sigmoid(1.702 * x)
237
+
238
+
239
+ class LayerNorm(nn.LayerNorm):
240
+ def forward(self, x):
241
+ return super().forward(x).type_as(x)
242
+
243
+
244
+ class SelfAttention(nn.Module):
245
+ def __init__(
246
+ self, dim, num_heads, causal=False, attn_dropout=0.0, proj_dropout=0.0
247
+ ):
248
+ assert dim % num_heads == 0
249
+ super().__init__()
250
+ self.dim = dim
251
+ self.num_heads = num_heads
252
+ self.head_dim = dim // num_heads
253
+ self.causal = causal
254
+ self.attn_dropout = attn_dropout
255
+ self.proj_dropout = proj_dropout
256
+
257
+ # layers
258
+ self.to_qkv = nn.Linear(dim, dim * 3)
259
+ self.proj = nn.Linear(dim, dim)
260
+
261
+ def forward(self, x):
262
+ """
263
+ x: [B, L, C].
264
+ """
265
+ # compute query, key, value
266
+ q, k, v = self.to_qkv(x).chunk(3, dim=-1)
267
+
268
+ # compute attention
269
+ x = flash_attention(q, k, v, num_heads=self.num_heads, compatibility_mode=True)
270
+
271
+ # output
272
+ x = self.proj(x)
273
+ x = F.dropout(x, self.proj_dropout, self.training)
274
+ return x
275
+
276
+
277
+ class SwiGLU(nn.Module):
278
+ def __init__(self, dim, mid_dim):
279
+ super().__init__()
280
+ self.dim = dim
281
+ self.mid_dim = mid_dim
282
+
283
+ # layers
284
+ self.fc1 = nn.Linear(dim, mid_dim)
285
+ self.fc2 = nn.Linear(dim, mid_dim)
286
+ self.fc3 = nn.Linear(mid_dim, dim)
287
+
288
+ def forward(self, x):
289
+ x = F.silu(self.fc1(x)) * self.fc2(x)
290
+ x = self.fc3(x)
291
+ return x
292
+
293
+
294
+ class AttentionBlock(nn.Module):
295
+ def __init__(
296
+ self,
297
+ dim,
298
+ mlp_ratio,
299
+ num_heads,
300
+ post_norm=False,
301
+ causal=False,
302
+ activation="quick_gelu",
303
+ attn_dropout=0.0,
304
+ proj_dropout=0.0,
305
+ norm_eps=1e-5,
306
+ ):
307
+ assert activation in ["quick_gelu", "gelu", "swi_glu"]
308
+ super().__init__()
309
+ self.dim = dim
310
+ self.mlp_ratio = mlp_ratio
311
+ self.num_heads = num_heads
312
+ self.post_norm = post_norm
313
+ self.causal = causal
314
+ self.norm_eps = norm_eps
315
+
316
+ # layers
317
+ self.norm1 = LayerNorm(dim, eps=norm_eps)
318
+ self.attn = SelfAttention(dim, num_heads, causal, attn_dropout, proj_dropout)
319
+ self.norm2 = LayerNorm(dim, eps=norm_eps)
320
+ if activation == "swi_glu":
321
+ self.mlp = SwiGLU(dim, int(dim * mlp_ratio))
322
+ else:
323
+ self.mlp = nn.Sequential(
324
+ nn.Linear(dim, int(dim * mlp_ratio)),
325
+ QuickGELU() if activation == "quick_gelu" else nn.GELU(),
326
+ nn.Linear(int(dim * mlp_ratio), dim),
327
+ nn.Dropout(proj_dropout),
328
+ )
329
+
330
+ def forward(self, x):
331
+ if self.post_norm:
332
+ x = x + self.norm1(self.attn(x))
333
+ x = x + self.norm2(self.mlp(x))
334
+ else:
335
+ x = x + self.attn(self.norm1(x))
336
+ x = x + self.mlp(self.norm2(x))
337
+ return x
338
+
339
+
340
+ class AttentionPool(nn.Module):
341
+ def __init__(
342
+ self,
343
+ dim,
344
+ mlp_ratio,
345
+ num_heads,
346
+ activation="gelu",
347
+ proj_dropout=0.0,
348
+ norm_eps=1e-5,
349
+ ):
350
+ assert dim % num_heads == 0
351
+ super().__init__()
352
+ self.dim = dim
353
+ self.mlp_ratio = mlp_ratio
354
+ self.num_heads = num_heads
355
+ self.head_dim = dim // num_heads
356
+ self.proj_dropout = proj_dropout
357
+ self.norm_eps = norm_eps
358
+
359
+ # layers
360
+ gain = 1.0 / math.sqrt(dim)
361
+ self.cls_embedding = nn.Parameter(gain * torch.randn(1, 1, dim))
362
+ self.to_q = nn.Linear(dim, dim)
363
+ self.to_kv = nn.Linear(dim, dim * 2)
364
+ self.proj = nn.Linear(dim, dim)
365
+ self.norm = LayerNorm(dim, eps=norm_eps)
366
+ self.mlp = nn.Sequential(
367
+ nn.Linear(dim, int(dim * mlp_ratio)),
368
+ QuickGELU() if activation == "quick_gelu" else nn.GELU(),
369
+ nn.Linear(int(dim * mlp_ratio), dim),
370
+ nn.Dropout(proj_dropout),
371
+ )
372
+
373
+ def forward(self, x):
374
+ """
375
+ x: [B, L, C].
376
+ """
377
+ b, s, c, n, d = *x.size(), self.num_heads, self.head_dim
378
+
379
+ # compute query, key, value
380
+ q = self.to_q(self.cls_embedding).view(1, 1, n * d).expand(b, -1, -1)
381
+ k, v = self.to_kv(x).chunk(2, dim=-1)
382
+
383
+ # compute attention
384
+ x = flash_attention(q, k, v, num_heads=self.num_heads, compatibility_mode=True)
385
+ x = x.reshape(b, 1, c)
386
+
387
+ # output
388
+ x = self.proj(x)
389
+ x = F.dropout(x, self.proj_dropout, self.training)
390
+
391
+ # mlp
392
+ x = x + self.mlp(self.norm(x))
393
+ return x[:, 0]
394
+
395
+
396
+ class VisionTransformer(nn.Module):
397
+ def __init__(
398
+ self,
399
+ image_size=224,
400
+ patch_size=16,
401
+ dim=768,
402
+ mlp_ratio=4,
403
+ out_dim=512,
404
+ num_heads=12,
405
+ num_layers=12,
406
+ pool_type="token",
407
+ pre_norm=True,
408
+ post_norm=False,
409
+ activation="quick_gelu",
410
+ attn_dropout=0.0,
411
+ proj_dropout=0.0,
412
+ embedding_dropout=0.0,
413
+ norm_eps=1e-5,
414
+ ):
415
+ if image_size % patch_size != 0:
416
+ print("[WARNING] image_size is not divisible by patch_size", flush=True)
417
+ assert pool_type in ("token", "token_fc", "attn_pool")
418
+ out_dim = out_dim or dim
419
+ super().__init__()
420
+ self.image_size = image_size
421
+ self.patch_size = patch_size
422
+ self.num_patches = (image_size // patch_size) ** 2
423
+ self.dim = dim
424
+ self.mlp_ratio = mlp_ratio
425
+ self.out_dim = out_dim
426
+ self.num_heads = num_heads
427
+ self.num_layers = num_layers
428
+ self.pool_type = pool_type
429
+ self.post_norm = post_norm
430
+ self.norm_eps = norm_eps
431
+
432
+ # embeddings
433
+ gain = 1.0 / math.sqrt(dim)
434
+ self.patch_embedding = nn.Conv2d(
435
+ 3, dim, kernel_size=patch_size, stride=patch_size, bias=not pre_norm
436
+ )
437
+ if pool_type in ("token", "token_fc"):
438
+ self.cls_embedding = nn.Parameter(gain * torch.randn(1, 1, dim))
439
+ self.pos_embedding = nn.Parameter(
440
+ gain
441
+ * torch.randn(
442
+ 1,
443
+ self.num_patches + (1 if pool_type in ("token", "token_fc") else 0),
444
+ dim,
445
+ )
446
+ )
447
+ self.dropout = nn.Dropout(embedding_dropout)
448
+
449
+ # transformer
450
+ self.pre_norm = LayerNorm(dim, eps=norm_eps) if pre_norm else None
451
+ self.transformer = nn.Sequential(
452
+ *[
453
+ AttentionBlock(
454
+ dim,
455
+ mlp_ratio,
456
+ num_heads,
457
+ post_norm,
458
+ False,
459
+ activation,
460
+ attn_dropout,
461
+ proj_dropout,
462
+ norm_eps,
463
+ )
464
+ for _ in range(num_layers)
465
+ ]
466
+ )
467
+ self.post_norm = LayerNorm(dim, eps=norm_eps)
468
+
469
+ # head
470
+ if pool_type == "token":
471
+ self.head = nn.Parameter(gain * torch.randn(dim, out_dim))
472
+ elif pool_type == "token_fc":
473
+ self.head = nn.Linear(dim, out_dim)
474
+ elif pool_type == "attn_pool":
475
+ self.head = AttentionPool(
476
+ dim, mlp_ratio, num_heads, activation, proj_dropout, norm_eps
477
+ )
478
+
479
+ def forward(self, x, interpolation=False, use_31_block=False):
480
+ b = x.size(0)
481
+
482
+ # embeddings
483
+ x = self.patch_embedding(x).flatten(2).permute(0, 2, 1)
484
+ if self.pool_type in ("token", "token_fc"):
485
+ x = torch.cat(
486
+ [
487
+ self.cls_embedding.expand(b, -1, -1).to(
488
+ dtype=x.dtype, device=x.device
489
+ ),
490
+ x,
491
+ ],
492
+ dim=1,
493
+ )
494
+ if interpolation:
495
+ e = pos_interpolate(self.pos_embedding, x.size(1))
496
+ else:
497
+ e = self.pos_embedding
498
+ e = e.to(dtype=x.dtype, device=x.device)
499
+ x = self.dropout(x + e)
500
+ if self.pre_norm is not None:
501
+ x = self.pre_norm(x)
502
+
503
+ # transformer
504
+ if use_31_block:
505
+ x = self.transformer[:-1](x)
506
+ return x
507
+ else:
508
+ x = self.transformer(x)
509
+ return x
510
+
511
+
512
+ class CLIP(nn.Module):
513
+ def __init__(
514
+ self,
515
+ embed_dim=512,
516
+ image_size=224,
517
+ patch_size=16,
518
+ vision_dim=768,
519
+ vision_mlp_ratio=4,
520
+ vision_heads=12,
521
+ vision_layers=12,
522
+ vision_pool="token",
523
+ vision_pre_norm=True,
524
+ vision_post_norm=False,
525
+ vocab_size=49408,
526
+ text_len=77,
527
+ text_dim=512,
528
+ text_mlp_ratio=4,
529
+ text_heads=8,
530
+ text_layers=12,
531
+ text_causal=True,
532
+ text_pool="argmax",
533
+ text_head_bias=False,
534
+ logit_bias=None,
535
+ activation="quick_gelu",
536
+ attn_dropout=0.0,
537
+ proj_dropout=0.0,
538
+ embedding_dropout=0.0,
539
+ norm_eps=1e-5,
540
+ ):
541
+ super().__init__()
542
+ self.embed_dim = embed_dim
543
+ self.image_size = image_size
544
+ self.patch_size = patch_size
545
+ self.vision_dim = vision_dim
546
+ self.vision_mlp_ratio = vision_mlp_ratio
547
+ self.vision_heads = vision_heads
548
+ self.vision_layers = vision_layers
549
+ self.vision_pool = vision_pool
550
+ self.vision_pre_norm = vision_pre_norm
551
+ self.vision_post_norm = vision_post_norm
552
+ self.vocab_size = vocab_size
553
+ self.text_len = text_len
554
+ self.text_dim = text_dim
555
+ self.text_mlp_ratio = text_mlp_ratio
556
+ self.text_heads = text_heads
557
+ self.text_layers = text_layers
558
+ self.text_causal = text_causal
559
+ self.text_pool = text_pool
560
+ self.text_head_bias = text_head_bias
561
+ self.norm_eps = norm_eps
562
+
563
+ # models
564
+ self.visual = VisionTransformer(
565
+ image_size=image_size,
566
+ patch_size=patch_size,
567
+ dim=vision_dim,
568
+ mlp_ratio=vision_mlp_ratio,
569
+ out_dim=embed_dim,
570
+ num_heads=vision_heads,
571
+ num_layers=vision_layers,
572
+ pool_type=vision_pool,
573
+ pre_norm=vision_pre_norm,
574
+ post_norm=vision_post_norm,
575
+ activation=activation,
576
+ attn_dropout=attn_dropout,
577
+ proj_dropout=proj_dropout,
578
+ embedding_dropout=embedding_dropout,
579
+ norm_eps=norm_eps,
580
+ )
581
+ self.textual = TextTransformer(
582
+ vocab_size=vocab_size,
583
+ text_len=text_len,
584
+ dim=text_dim,
585
+ mlp_ratio=text_mlp_ratio,
586
+ out_dim=embed_dim,
587
+ num_heads=text_heads,
588
+ num_layers=text_layers,
589
+ causal=text_causal,
590
+ pool_type=text_pool,
591
+ head_bias=text_head_bias,
592
+ activation=activation,
593
+ attn_dropout=attn_dropout,
594
+ proj_dropout=proj_dropout,
595
+ embedding_dropout=embedding_dropout,
596
+ norm_eps=norm_eps,
597
+ )
598
+ self.log_scale = nn.Parameter(math.log(1 / 0.07) * torch.ones([]))
599
+ if logit_bias is not None:
600
+ self.logit_bias = nn.Parameter(logit_bias * torch.ones([]))
601
+
602
+ # initialize weights
603
+ self.init_weights()
604
+
605
+ def forward(self, imgs, txt_ids):
606
+ """
607
+ imgs: [B, 3, H, W] of torch.float32.
608
+ - mean: [0.48145466, 0.4578275, 0.40821073]
609
+ - std: [0.26862954, 0.26130258, 0.27577711]
610
+ txt_ids: [B, L] of torch.long. Encoded by data.CLIPTokenizer.
611
+ """
612
+ xi = self.visual(imgs)
613
+ xt = self.textual(txt_ids)
614
+ return xi, xt
615
+
616
+ def init_weights(self):
617
+ # embeddings
618
+ nn.init.normal_(self.textual.token_embedding.weight, std=0.02)
619
+ nn.init.normal_(self.visual.patch_embedding.weight, std=0.1)
620
+
621
+ # attentions
622
+ for modality in ["visual", "textual"]:
623
+ dim = self.vision_dim if modality == "visual" else self.text_dim
624
+ transformer = getattr(self, modality).transformer
625
+ proj_gain = (1.0 / math.sqrt(dim)) * (1.0 / math.sqrt(2 * len(transformer)))
626
+ attn_gain = 1.0 / math.sqrt(dim)
627
+ mlp_gain = 1.0 / math.sqrt(2.0 * dim)
628
+ for block in transformer:
629
+ nn.init.normal_(block.attn.to_qkv.weight, std=attn_gain)
630
+ nn.init.normal_(block.attn.proj.weight, std=proj_gain)
631
+ nn.init.normal_(block.mlp[0].weight, std=mlp_gain)
632
+ nn.init.normal_(block.mlp[2].weight, std=proj_gain)
633
+
634
+ def param_groups(self):
635
+ groups = [
636
+ {
637
+ "params": [
638
+ p
639
+ for n, p in self.named_parameters()
640
+ if "norm" in n or n.endswith("bias")
641
+ ],
642
+ "weight_decay": 0.0,
643
+ },
644
+ {
645
+ "params": [
646
+ p
647
+ for n, p in self.named_parameters()
648
+ if not ("norm" in n or n.endswith("bias"))
649
+ ]
650
+ },
651
+ ]
652
+ return groups
653
+
654
+
655
+ class XLMRobertaWithHead(XLMRoberta):
656
+ def __init__(self, **kwargs):
657
+ self.out_dim = kwargs.pop("out_dim")
658
+ super().__init__(**kwargs)
659
+
660
+ # head
661
+ mid_dim = (self.dim + self.out_dim) // 2
662
+ self.head = nn.Sequential(
663
+ nn.Linear(self.dim, mid_dim, bias=False),
664
+ nn.GELU(),
665
+ nn.Linear(mid_dim, self.out_dim, bias=False),
666
+ )
667
+
668
+ def forward(self, ids):
669
+ # xlm-roberta
670
+ x = super().forward(ids)
671
+
672
+ # average pooling
673
+ mask = ids.ne(self.pad_id).unsqueeze(-1).to(x)
674
+ x = (x * mask).sum(dim=1) / mask.sum(dim=1)
675
+
676
+ # head
677
+ x = self.head(x)
678
+ return x
679
+
680
+
681
+ class XLMRobertaCLIP(nn.Module):
682
+ def __init__(
683
+ self,
684
+ embed_dim=1024,
685
+ image_size=224,
686
+ patch_size=14,
687
+ vision_dim=1280,
688
+ vision_mlp_ratio=4,
689
+ vision_heads=16,
690
+ vision_layers=32,
691
+ vision_pool="token",
692
+ vision_pre_norm=True,
693
+ vision_post_norm=False,
694
+ activation="gelu",
695
+ vocab_size=250002,
696
+ max_text_len=514,
697
+ type_size=1,
698
+ pad_id=1,
699
+ text_dim=1024,
700
+ text_heads=16,
701
+ text_layers=24,
702
+ text_post_norm=True,
703
+ text_dropout=0.1,
704
+ attn_dropout=0.0,
705
+ proj_dropout=0.0,
706
+ embedding_dropout=0.0,
707
+ norm_eps=1e-5,
708
+ ):
709
+ super().__init__()
710
+ self.embed_dim = embed_dim
711
+ self.image_size = image_size
712
+ self.patch_size = patch_size
713
+ self.vision_dim = vision_dim
714
+ self.vision_mlp_ratio = vision_mlp_ratio
715
+ self.vision_heads = vision_heads
716
+ self.vision_layers = vision_layers
717
+ self.vision_pre_norm = vision_pre_norm
718
+ self.vision_post_norm = vision_post_norm
719
+ self.activation = activation
720
+ self.vocab_size = vocab_size
721
+ self.max_text_len = max_text_len
722
+ self.type_size = type_size
723
+ self.pad_id = pad_id
724
+ self.text_dim = text_dim
725
+ self.text_heads = text_heads
726
+ self.text_layers = text_layers
727
+ self.text_post_norm = text_post_norm
728
+ self.norm_eps = norm_eps
729
+
730
+ # models
731
+ self.visual = VisionTransformer(
732
+ image_size=image_size,
733
+ patch_size=patch_size,
734
+ dim=vision_dim,
735
+ mlp_ratio=vision_mlp_ratio,
736
+ out_dim=embed_dim,
737
+ num_heads=vision_heads,
738
+ num_layers=vision_layers,
739
+ pool_type=vision_pool,
740
+ pre_norm=vision_pre_norm,
741
+ post_norm=vision_post_norm,
742
+ activation=activation,
743
+ attn_dropout=attn_dropout,
744
+ proj_dropout=proj_dropout,
745
+ embedding_dropout=embedding_dropout,
746
+ norm_eps=norm_eps,
747
+ )
748
+ self.textual = None
749
+ self.log_scale = nn.Parameter(math.log(1 / 0.07) * torch.ones([]))
750
+
751
+ def forward(self, imgs, txt_ids):
752
+ """
753
+ imgs: [B, 3, H, W] of torch.float32.
754
+ - mean: [0.48145466, 0.4578275, 0.40821073]
755
+ - std: [0.26862954, 0.26130258, 0.27577711]
756
+ txt_ids: [B, L] of torch.long.
757
+ Encoded by data.CLIPTokenizer.
758
+ """
759
+ xi = self.visual(imgs)
760
+ xt = self.textual(txt_ids)
761
+ return xi, xt
762
+
763
+ def param_groups(self):
764
+ groups = [
765
+ {
766
+ "params": [
767
+ p
768
+ for n, p in self.named_parameters()
769
+ if "norm" in n or n.endswith("bias")
770
+ ],
771
+ "weight_decay": 0.0,
772
+ },
773
+ {
774
+ "params": [
775
+ p
776
+ for n, p in self.named_parameters()
777
+ if not ("norm" in n or n.endswith("bias"))
778
+ ]
779
+ },
780
+ ]
781
+ return groups
782
+
783
+
784
+ def _clip(
785
+ pretrained=False,
786
+ pretrained_name=None,
787
+ model_cls=CLIP,
788
+ return_transforms=False,
789
+ return_tokenizer=False,
790
+ tokenizer_padding="eos",
791
+ dtype=torch.float32,
792
+ device="cpu",
793
+ **kwargs,
794
+ ):
795
+ # init model
796
+ if pretrained and pretrained_name:
797
+ from sora import BUCKET, DOWNLOAD_TO_CACHE
798
+
799
+ # init a meta model
800
+ with torch.device("meta"):
801
+ model = model_cls(**kwargs)
802
+
803
+ # checkpoint path
804
+ checkpoint = f"models/clip/{pretrained_name}"
805
+ if dtype in (torch.float16, torch.bfloat16):
806
+ suffix = "-" + {torch.float16: "fp16", torch.bfloat16: "bf16"}[dtype]
807
+ if object_exists(BUCKET, f"{checkpoint}{suffix}.pth"):
808
+ checkpoint = f"{checkpoint}{suffix}"
809
+ checkpoint += ".pth"
810
+
811
+ # load
812
+ model.load_state_dict(
813
+ torch.load(DOWNLOAD_TO_CACHE(checkpoint), map_location=device),
814
+ assign=True,
815
+ strict=False,
816
+ )
817
+ else:
818
+ # init a model on device
819
+ with torch.device(device):
820
+ model = model_cls(**kwargs)
821
+
822
+ # set device
823
+ output = (model,)
824
+
825
+ # init transforms
826
+ if return_transforms:
827
+ # mean and std
828
+ if "siglip" in pretrained_name.lower():
829
+ mean, std = [0.5, 0.5, 0.5], [0.5, 0.5, 0.5]
830
+ else:
831
+ mean = [0.48145466, 0.4578275, 0.40821073]
832
+ std = [0.26862954, 0.26130258, 0.27577711]
833
+
834
+ # transforms
835
+ transforms = T.Compose(
836
+ [
837
+ T.Resize(
838
+ (model.image_size, model.image_size),
839
+ interpolation=T.InterpolationMode.BICUBIC,
840
+ ),
841
+ T.ToTensor(),
842
+ T.Normalize(mean=mean, std=std),
843
+ ]
844
+ )
845
+ output += (transforms,)
846
+
847
+ # init tokenizer
848
+ if return_tokenizer:
849
+ from sora import data
850
+
851
+ if "siglip" in pretrained_name.lower():
852
+ tokenizer = data.HuggingfaceTokenizer(
853
+ name=f"timm/{pretrained_name}",
854
+ seq_len=model.text_len,
855
+ clean="canonicalize",
856
+ )
857
+ elif "xlm" in pretrained_name.lower():
858
+ tokenizer = data.HuggingfaceTokenizer(
859
+ name="xlm-roberta-large",
860
+ seq_len=model.max_text_len - 2,
861
+ clean="whitespace",
862
+ )
863
+ elif "mba" in pretrained_name.lower():
864
+ tokenizer = data.HuggingfaceTokenizer(
865
+ name="facebook/xlm-roberta-xl",
866
+ seq_len=model.max_text_len - 2,
867
+ clean="whitespace",
868
+ )
869
+ else:
870
+ tokenizer = data.CLIPTokenizer(
871
+ seq_len=model.text_len, padding=tokenizer_padding
872
+ )
873
+ output += (tokenizer,)
874
+ return output[0] if len(output) == 1 else output
875
+
876
+
877
+ def clip_xlm_roberta_vit_h_14(
878
+ pretrained=False,
879
+ pretrained_name="open-clip-xlm-roberta-large-vit-huge-14",
880
+ **kwargs,
881
+ ):
882
+ cfg = dict(
883
+ embed_dim=1024,
884
+ image_size=224,
885
+ patch_size=14,
886
+ vision_dim=1280,
887
+ vision_mlp_ratio=4,
888
+ vision_heads=16,
889
+ vision_layers=32,
890
+ vision_pool="token",
891
+ activation="gelu",
892
+ vocab_size=250002,
893
+ max_text_len=514,
894
+ type_size=1,
895
+ pad_id=1,
896
+ text_dim=1024,
897
+ text_heads=16,
898
+ text_layers=24,
899
+ text_post_norm=True,
900
+ text_dropout=0.1,
901
+ attn_dropout=0.0,
902
+ proj_dropout=0.0,
903
+ embedding_dropout=0.0,
904
+ )
905
+ cfg.update(**kwargs)
906
+ return _clip(pretrained, pretrained_name, XLMRobertaCLIP, **cfg)
907
+
908
+
909
+ class WanImageEncoder(torch.nn.Module):
910
+ def __init__(self):
911
+ super().__init__()
912
+ # init model
913
+ self.model, self.transforms = clip_xlm_roberta_vit_h_14(
914
+ pretrained=False,
915
+ return_transforms=True,
916
+ return_tokenizer=False,
917
+ dtype=torch.float32,
918
+ device="cpu",
919
+ )
920
+
921
+ def encode_image(self, videos):
922
+ # preprocess
923
+ size = (self.model.image_size,) * 2
924
+ videos = torch.cat(
925
+ [
926
+ F.interpolate(u, size=size, mode="bicubic", align_corners=False)
927
+ for u in videos
928
+ ]
929
+ )
930
+ videos = self.transforms.transforms[-1](videos.mul_(0.5).add_(0.5))
931
+
932
+ # forward
933
+ dtype = next(iter(self.model.visual.parameters())).dtype
934
+ videos = videos.to(dtype)
935
+ out = self.model.visual(videos, use_31_block=True)
936
+ return out
937
+
938
+ @staticmethod
939
+ def state_dict_converter():
940
+ return WanImageEncoderStateDictConverter()
941
+
942
+
943
+ class WanImageEncoderStateDictConverter:
944
+ def __init__(self):
945
+ pass
946
+
947
+ def from_diffusers(self, state_dict):
948
+ return state_dict
949
+
950
+ def from_civitai(self, state_dict):
951
+ state_dict_ = {}
952
+ for name, param in state_dict.items():
953
+ if name.startswith("textual."):
954
+ continue
955
+ name = "model." + name
956
+ state_dict_[name] = param
957
+ return state_dict_
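
A minimal usage sketch for the image encoder defined above (hypothetical, not part of the commit; it assumes the repo's `models` package layout and runs with randomly initialized weights). Per `encode_image`, the input is a list of [T, 3, H, W] frame tensors in [-1, 1], and the output is the feature map from the penultimate ViT block.

import torch
from models.wan_video_image_encoder import WanImageEncoder

# Build the encoder with random weights (real checkpoints are loaded elsewhere
# through the state dict converter), then encode a single reference frame.
encoder = WanImageEncoder().eval()
frames = torch.rand(1, 3, 480, 832) * 2 - 1   # [T, 3, H, W] in [-1, 1]
with torch.no_grad():
    feats = encoder.encode_image([frames])
print(feats.shape)  # torch.Size([1, 257, 1280]): 16x16 patches + CLS token at dim 1280
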
models/wan_video_motion_controller.py ADDED
@@ -0,0 +1,41 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from .wan_video_dit import sinusoidal_embedding_1d
4
+
5
+
6
+ class WanMotionControllerModel(torch.nn.Module):
7
+ def __init__(self, freq_dim=256, dim=1536):
8
+ super().__init__()
9
+ self.freq_dim = freq_dim
10
+ self.linear = nn.Sequential(
11
+ nn.Linear(freq_dim, dim),
12
+ nn.SiLU(),
13
+ nn.Linear(dim, dim),
14
+ nn.SiLU(),
15
+ nn.Linear(dim, dim * 6),
16
+ )
17
+
18
+ def forward(self, motion_bucket_id):
19
+ emb = sinusoidal_embedding_1d(self.freq_dim, motion_bucket_id * 10)
20
+ emb = self.linear(emb)
21
+ return emb
22
+
23
+ def init(self):
24
+ state_dict = self.linear[-1].state_dict()
25
+ state_dict = {i: state_dict[i] * 0 for i in state_dict}
26
+ self.linear[-1].load_state_dict(state_dict)
27
+
28
+ @staticmethod
29
+ def state_dict_converter():
30
+ return WanMotionControllerModelDictConverter()
31
+
32
+
33
+ class WanMotionControllerModelDictConverter:
34
+ def __init__(self):
35
+ pass
36
+
37
+ def from_diffusers(self, state_dict):
38
+ return state_dict
39
+
40
+ def from_civitai(self, state_dict):
41
+ return state_dict
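
A construction-only sketch of the motion controller above (hypothetical, not part of the commit; the import path assumes the repo's `models` package layout). Per the code, forward maps a 1-D tensor of motion bucket ids to an [N, dim * 6] modulation embedding; the sketch only checks that init() zeroes the final projection so the added conditioning starts as a no-op.

import torch
from models.wan_video_motion_controller import WanMotionControllerModel

controller = WanMotionControllerModel(freq_dim=256, dim=1536)
controller.init()  # zero the last Linear so the controller initially contributes nothing
assert controller.linear[-1].weight.abs().sum().item() == 0.0
assert controller.linear[-1].bias.abs().sum().item() == 0.0
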
models/wan_video_text_encoder.py ADDED
@@ -0,0 +1,289 @@
1
+ import math
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+
7
+
8
+ def fp16_clamp(x):
9
+ if x.dtype == torch.float16 and torch.isinf(x).any():
10
+ clamp = torch.finfo(x.dtype).max - 1000
11
+ x = torch.clamp(x, min=-clamp, max=clamp)
12
+ return x
13
+
14
+
15
+ class GELU(nn.Module):
16
+ def forward(self, x):
17
+ return (
18
+ 0.5
19
+ * x
20
+ * (
21
+ 1.0
22
+ + torch.tanh(
23
+ math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))
24
+ )
25
+ )
26
+ )
27
+
28
+
29
+ class T5LayerNorm(nn.Module):
30
+ def __init__(self, dim, eps=1e-6):
31
+ super(T5LayerNorm, self).__init__()
32
+ self.dim = dim
33
+ self.eps = eps
34
+ self.weight = nn.Parameter(torch.ones(dim))
35
+
36
+ def forward(self, x):
37
+ x = x * torch.rsqrt(x.float().pow(2).mean(dim=-1, keepdim=True) + self.eps)
38
+ if self.weight.dtype in [torch.float16, torch.bfloat16]:
39
+ x = x.type_as(self.weight)
40
+ return self.weight * x
41
+
42
+
43
+ class T5Attention(nn.Module):
44
+ def __init__(self, dim, dim_attn, num_heads, dropout=0.1):
45
+ assert dim_attn % num_heads == 0
46
+ super(T5Attention, self).__init__()
47
+ self.dim = dim
48
+ self.dim_attn = dim_attn
49
+ self.num_heads = num_heads
50
+ self.head_dim = dim_attn // num_heads
51
+
52
+ # layers
53
+ self.q = nn.Linear(dim, dim_attn, bias=False)
54
+ self.k = nn.Linear(dim, dim_attn, bias=False)
55
+ self.v = nn.Linear(dim, dim_attn, bias=False)
56
+ self.o = nn.Linear(dim_attn, dim, bias=False)
57
+ self.dropout = nn.Dropout(dropout)
58
+
59
+ def forward(self, x, context=None, mask=None, pos_bias=None):
60
+ """
61
+ x: [B, L1, C].
62
+ context: [B, L2, C] or None.
63
+ mask: [B, L2] or [B, L1, L2] or None.
64
+ """
65
+ # check inputs
66
+ context = x if context is None else context
67
+ b, n, c = x.size(0), self.num_heads, self.head_dim
68
+
69
+ # compute query, key, value
70
+ q = self.q(x).view(b, -1, n, c)
71
+ k = self.k(context).view(b, -1, n, c)
72
+ v = self.v(context).view(b, -1, n, c)
73
+
74
+ # attention bias
75
+ attn_bias = x.new_zeros(b, n, q.size(1), k.size(1))
76
+ if pos_bias is not None:
77
+ attn_bias += pos_bias
78
+ if mask is not None:
79
+ assert mask.ndim in [2, 3]
80
+ mask = mask.view(b, 1, 1, -1) if mask.ndim == 2 else mask.unsqueeze(1)
81
+ attn_bias.masked_fill_(mask == 0, torch.finfo(x.dtype).min)
82
+
83
+ # compute attention (T5 does not use scaling)
84
+ attn = torch.einsum("binc,bjnc->bnij", q, k) + attn_bias
85
+ attn = F.softmax(attn.float(), dim=-1).type_as(attn)
86
+ x = torch.einsum("bnij,bjnc->binc", attn, v)
87
+
88
+ # output
89
+ x = x.reshape(b, -1, n * c)
90
+ x = self.o(x)
91
+ x = self.dropout(x)
92
+ return x
93
+
94
+
95
+ class T5FeedForward(nn.Module):
96
+ def __init__(self, dim, dim_ffn, dropout=0.1):
97
+ super(T5FeedForward, self).__init__()
98
+ self.dim = dim
99
+ self.dim_ffn = dim_ffn
100
+
101
+ # layers
102
+ self.gate = nn.Sequential(nn.Linear(dim, dim_ffn, bias=False), GELU())
103
+ self.fc1 = nn.Linear(dim, dim_ffn, bias=False)
104
+ self.fc2 = nn.Linear(dim_ffn, dim, bias=False)
105
+ self.dropout = nn.Dropout(dropout)
106
+
107
+ def forward(self, x):
108
+ x = self.fc1(x) * self.gate(x)
109
+ x = self.dropout(x)
110
+ x = self.fc2(x)
111
+ x = self.dropout(x)
112
+ return x
113
+
114
+
115
+ class T5SelfAttention(nn.Module):
116
+ def __init__(
117
+ self,
118
+ dim,
119
+ dim_attn,
120
+ dim_ffn,
121
+ num_heads,
122
+ num_buckets,
123
+ shared_pos=True,
124
+ dropout=0.1,
125
+ ):
126
+ super(T5SelfAttention, self).__init__()
127
+ self.dim = dim
128
+ self.dim_attn = dim_attn
129
+ self.dim_ffn = dim_ffn
130
+ self.num_heads = num_heads
131
+ self.num_buckets = num_buckets
132
+ self.shared_pos = shared_pos
133
+
134
+ # layers
135
+ self.norm1 = T5LayerNorm(dim)
136
+ self.attn = T5Attention(dim, dim_attn, num_heads, dropout)
137
+ self.norm2 = T5LayerNorm(dim)
138
+ self.ffn = T5FeedForward(dim, dim_ffn, dropout)
139
+ self.pos_embedding = (
140
+ None
141
+ if shared_pos
142
+ else T5RelativeEmbedding(num_buckets, num_heads, bidirectional=True)
143
+ )
144
+
145
+ def forward(self, x, mask=None, pos_bias=None):
146
+ e = pos_bias if self.shared_pos else self.pos_embedding(x.size(1), x.size(1))
147
+ x = fp16_clamp(x + self.attn(self.norm1(x), mask=mask, pos_bias=e))
148
+ x = fp16_clamp(x + self.ffn(self.norm2(x)))
149
+ return x
150
+
151
+
152
+ class T5RelativeEmbedding(nn.Module):
153
+ def __init__(self, num_buckets, num_heads, bidirectional, max_dist=128):
154
+ super(T5RelativeEmbedding, self).__init__()
155
+ self.num_buckets = num_buckets
156
+ self.num_heads = num_heads
157
+ self.bidirectional = bidirectional
158
+ self.max_dist = max_dist
159
+
160
+ # layers
161
+ self.embedding = nn.Embedding(num_buckets, num_heads)
162
+
163
+ def forward(self, lq, lk):
164
+ device = self.embedding.weight.device
165
+ # rel_pos = torch.arange(lk).unsqueeze(0).to(device) - \
166
+ # torch.arange(lq).unsqueeze(1).to(device)
167
+ rel_pos = torch.arange(lk, device=device).unsqueeze(0) - torch.arange(
168
+ lq, device=device
169
+ ).unsqueeze(1)
170
+ rel_pos = self._relative_position_bucket(rel_pos)
171
+ rel_pos_embeds = self.embedding(rel_pos)
172
+ rel_pos_embeds = rel_pos_embeds.permute(2, 0, 1).unsqueeze(0) # [1, N, Lq, Lk]
173
+ return rel_pos_embeds.contiguous()
174
+
175
+ def _relative_position_bucket(self, rel_pos):
176
+ # preprocess
177
+ if self.bidirectional:
178
+ num_buckets = self.num_buckets // 2
179
+ rel_buckets = (rel_pos > 0).long() * num_buckets
180
+ rel_pos = torch.abs(rel_pos)
181
+ else:
182
+ num_buckets = self.num_buckets
183
+ rel_buckets = 0
184
+ rel_pos = -torch.min(rel_pos, torch.zeros_like(rel_pos))
185
+
186
+ # embeddings for small and large positions
187
+ max_exact = num_buckets // 2
188
+ rel_pos_large = (
189
+ max_exact
190
+ + (
191
+ torch.log(rel_pos.float() / max_exact)
192
+ / math.log(self.max_dist / max_exact)
193
+ * (num_buckets - max_exact)
194
+ ).long()
195
+ )
196
+ rel_pos_large = torch.min(
197
+ rel_pos_large, torch.full_like(rel_pos_large, num_buckets - 1)
198
+ )
199
+ rel_buckets += torch.where(rel_pos < max_exact, rel_pos, rel_pos_large)
200
+ return rel_buckets
201
+
202
+
203
+ def init_weights(m):
204
+ if isinstance(m, T5LayerNorm):
205
+ nn.init.ones_(m.weight)
206
+ elif isinstance(m, T5FeedForward):
207
+ nn.init.normal_(m.gate[0].weight, std=m.dim**-0.5)
208
+ nn.init.normal_(m.fc1.weight, std=m.dim**-0.5)
209
+ nn.init.normal_(m.fc2.weight, std=m.dim_ffn**-0.5)
210
+ elif isinstance(m, T5Attention):
211
+ nn.init.normal_(m.q.weight, std=(m.dim * m.dim_attn) ** -0.5)
212
+ nn.init.normal_(m.k.weight, std=m.dim**-0.5)
213
+ nn.init.normal_(m.v.weight, std=m.dim**-0.5)
214
+ nn.init.normal_(m.o.weight, std=(m.num_heads * m.dim_attn) ** -0.5)
215
+ elif isinstance(m, T5RelativeEmbedding):
216
+ nn.init.normal_(
217
+ m.embedding.weight, std=(2 * m.num_buckets * m.num_heads) ** -0.5
218
+ )
219
+
220
+
221
+ class WanTextEncoder(torch.nn.Module):
222
+ def __init__(
223
+ self,
224
+ vocab=256384,
225
+ dim=4096,
226
+ dim_attn=4096,
227
+ dim_ffn=10240,
228
+ num_heads=64,
229
+ num_layers=24,
230
+ num_buckets=32,
231
+ shared_pos=False,
232
+ dropout=0.1,
233
+ ):
234
+ super(WanTextEncoder, self).__init__()
235
+ self.dim = dim
236
+ self.dim_attn = dim_attn
237
+ self.dim_ffn = dim_ffn
238
+ self.num_heads = num_heads
239
+ self.num_layers = num_layers
240
+ self.num_buckets = num_buckets
241
+ self.shared_pos = shared_pos
242
+
243
+ # layers
244
+ self.token_embedding = (
245
+ vocab if isinstance(vocab, nn.Embedding) else nn.Embedding(vocab, dim)
246
+ )
247
+ self.pos_embedding = (
248
+ T5RelativeEmbedding(num_buckets, num_heads, bidirectional=True)
249
+ if shared_pos
250
+ else None
251
+ )
252
+ self.dropout = nn.Dropout(dropout)
253
+ self.blocks = nn.ModuleList(
254
+ [
255
+ T5SelfAttention(
256
+ dim, dim_attn, dim_ffn, num_heads, num_buckets, shared_pos, dropout
257
+ )
258
+ for _ in range(num_layers)
259
+ ]
260
+ )
261
+ self.norm = T5LayerNorm(dim)
262
+
263
+ # initialize weights
264
+ self.apply(init_weights)
265
+
266
+ def forward(self, ids, mask=None):
267
+ x = self.token_embedding(ids)
268
+ x = self.dropout(x)
269
+ e = self.pos_embedding(x.size(1), x.size(1)) if self.shared_pos else None
270
+ for block in self.blocks:
271
+ x = block(x, mask, pos_bias=e)
272
+ x = self.norm(x)
273
+ x = self.dropout(x)
274
+ return x
275
+
276
+ @staticmethod
277
+ def state_dict_converter():
278
+ return WanTextEncoderStateDictConverter()
279
+
280
+
281
+ class WanTextEncoderStateDictConverter:
282
+ def __init__(self):
283
+ pass
284
+
285
+ def from_diffusers(self, state_dict):
286
+ return state_dict
287
+
288
+ def from_civitai(self, state_dict):
289
+ return state_dict
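
A hypothetical smoke test for the text encoder interface (not part of the commit; the import path assumes the repo's `models` package layout). It uses a deliberately tiny configuration so it runs quickly with random weights; the defaults above describe the much larger T5-style encoder whose weights are loaded separately.

import torch
from models.wan_video_text_encoder import WanTextEncoder

# Tiny configuration purely to exercise shapes; real checkpoints use the defaults.
enc = WanTextEncoder(vocab=1000, dim=64, dim_attn=64, dim_ffn=128,
                     num_heads=4, num_layers=2, num_buckets=8).eval()
ids = torch.randint(0, 1000, (1, 16))       # [B, L] token ids
mask = torch.ones(1, 16, dtype=torch.long)  # [B, L] attention mask
out = enc(ids, mask)                        # -> torch.Size([1, 16, 64])
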
models/wan_video_vace.py ADDED
@@ -0,0 +1,140 @@
1
+ import torch
2
+ from .wan_video_dit import DiTBlock
3
+ from .utils import hash_state_dict_keys
4
+
5
+
6
+ class VaceWanAttentionBlock(DiTBlock):
7
+ def __init__(self, has_image_input, dim, num_heads, ffn_dim, eps=1e-6, block_id=0):
8
+ super().__init__(has_image_input, dim, num_heads, ffn_dim, eps=eps)
9
+ self.block_id = block_id
10
+ if block_id == 0:
11
+ self.before_proj = torch.nn.Linear(self.dim, self.dim)
12
+ self.after_proj = torch.nn.Linear(self.dim, self.dim)
13
+
14
+ def forward(self, c, x, context, t_mod, freqs):
15
+ if self.block_id == 0:
16
+ c = self.before_proj(c) + x
17
+ all_c = []
18
+ else:
19
+ all_c = list(torch.unbind(c))
20
+ c = all_c.pop(-1)
21
+ c, _ = super().forward(c, context, t_mod, freqs)
22
+ c_skip = self.after_proj(c)
23
+ all_c += [c_skip, c]
24
+ c = torch.stack(all_c)
25
+ return c
26
+
27
+
28
+ class VaceWanModel(torch.nn.Module):
29
+ def __init__(
30
+ self,
31
+ vace_layers=(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28),
32
+ vace_in_dim=96,
33
+ patch_size=(1, 2, 2),
34
+ has_image_input=False,
35
+ dim=1536,
36
+ num_heads=12,
37
+ ffn_dim=8960,
38
+ eps=1e-6,
39
+ ):
40
+ super().__init__()
41
+ self.vace_layers = vace_layers
42
+ self.vace_in_dim = vace_in_dim
43
+ self.vace_layers_mapping = {i: n for n, i in enumerate(self.vace_layers)}
44
+
45
+ # vace blocks
46
+ self.vace_blocks = torch.nn.ModuleList(
47
+ [
48
+ VaceWanAttentionBlock(
49
+ has_image_input, dim, num_heads, ffn_dim, eps, block_id=i
50
+ )
51
+ for i in self.vace_layers
52
+ ]
53
+ )
54
+
55
+ # vace patch embeddings
56
+ self.vace_patch_embedding = torch.nn.Conv3d(
57
+ vace_in_dim, dim, kernel_size=patch_size, stride=patch_size
58
+ )
59
+
60
+ def forward(
61
+ self,
62
+ x,
63
+ vace_context,
64
+ context,
65
+ t_mod,
66
+ freqs,
67
+ use_gradient_checkpointing: bool = False,
68
+ use_gradient_checkpointing_offload: bool = False,
69
+ ):
70
+ c = [self.vace_patch_embedding(u.unsqueeze(0)) for u in vace_context]
71
+ c = [u.flatten(2).transpose(1, 2) for u in c]
72
+ c = torch.cat(
73
+ [
74
+ torch.cat([u, u.new_zeros(1, x.shape[1] - u.size(1), u.size(2))], dim=1)
75
+ for u in c
76
+ ]
77
+ )
78
+
79
+ def create_custom_forward(module):
80
+ def custom_forward(*inputs):
81
+ return module(*inputs)
82
+
83
+ return custom_forward
84
+
85
+ for block in self.vace_blocks:
86
+ if use_gradient_checkpointing_offload:
87
+ with torch.autograd.graph.save_on_cpu():
88
+ c = torch.utils.checkpoint.checkpoint(
89
+ create_custom_forward(block),
90
+ c,
91
+ x,
92
+ context,
93
+ t_mod,
94
+ freqs,
95
+ use_reentrant=False,
96
+ )
97
+ elif use_gradient_checkpointing:
98
+ c = torch.utils.checkpoint.checkpoint(
99
+ create_custom_forward(block),
100
+ c,
101
+ x,
102
+ context,
103
+ t_mod,
104
+ freqs,
105
+ use_reentrant=False,
106
+ )
107
+ else:
108
+ c = block(c, x, context, t_mod, freqs)
109
+ hints = torch.unbind(c)[:-1]
110
+ return hints
111
+
112
+ @staticmethod
113
+ def state_dict_converter():
114
+ return VaceWanModelDictConverter()
115
+
116
+
117
+ class VaceWanModelDictConverter:
118
+ def __init__(self):
119
+ pass
120
+
121
+ def from_civitai(self, state_dict):
122
+ state_dict_ = {
123
+ name: param for name, param in state_dict.items() if name.startswith("vace")
124
+ }
125
+ if (
126
+ hash_state_dict_keys(state_dict_) == "3b2726384e4f64837bdf216eea3f310d"
127
+ ): # vace 14B
128
+ config = {
129
+ "vace_layers": (0, 5, 10, 15, 20, 25, 30, 35),
130
+ "vace_in_dim": 96,
131
+ "patch_size": (1, 2, 2),
132
+ "has_image_input": False,
133
+ "dim": 5120,
134
+ "num_heads": 40,
135
+ "ffn_dim": 13824,
136
+ "eps": 1e-06,
137
+ }
138
+ else:
139
+ config = {}
140
+ return state_dict_, config
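
A construction-only sketch (hypothetical, not part of the commit; it assumes the repo's `models` package layout and the DiTBlock constructor used above) of the default VACE layout: a control block is attached to every second DiT layer, and vace_layers_mapping translates a DiT layer index into the index of the corresponding hint.

from models.wan_video_vace import VaceWanModel

vace = VaceWanModel()  # defaults correspond to the 1.3B-sized DiT; weights are random here
print(vace.vace_layers_mapping)                # {0: 0, 2: 1, 4: 2, ..., 28: 14}
print(vace.vace_patch_embedding.weight.shape)  # torch.Size([1536, 96, 1, 2, 2])
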
models/wan_video_vae.py ADDED
@@ -0,0 +1,1634 @@
1
+ from einops import rearrange, repeat
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+ from tqdm import tqdm
7
+
8
+ CACHE_T = 2
9
+
10
+
11
+ def check_is_instance(model, module_class):
12
+ if isinstance(model, module_class):
13
+ return True
14
+ if hasattr(model, "module") and isinstance(model.module, module_class):
15
+ return True
16
+ return False
17
+
18
+
19
+ def block_causal_mask(x, block_size):
20
+ # params
21
+ b, n, s, _, device = *x.size(), x.device
22
+ assert s % block_size == 0
23
+ num_blocks = s // block_size
24
+
25
+ # build mask
26
+ mask = torch.zeros(b, n, s, s, dtype=torch.bool, device=device)
27
+ for i in range(num_blocks):
28
+ mask[:, :, i * block_size : (i + 1) * block_size, : (i + 1) * block_size] = 1
29
+ return mask
30
+
31
+
32
+ class CausalConv3d(nn.Conv3d):
33
+ """
34
+ Causal 3d convolution.
35
+ """
36
+
37
+ def __init__(self, *args, **kwargs):
38
+ super().__init__(*args, **kwargs)
39
+ self._padding = (
40
+ self.padding[2],
41
+ self.padding[2],
42
+ self.padding[1],
43
+ self.padding[1],
44
+ 2 * self.padding[0],
45
+ 0,
46
+ )
47
+ self.padding = (0, 0, 0)
48
+
49
+ def forward(self, x, cache_x=None):
50
+ padding = list(self._padding)
51
+ if cache_x is not None and self._padding[4] > 0:
52
+ cache_x = cache_x.to(x.device)
53
+ x = torch.cat([cache_x, x], dim=2)
54
+ padding[4] -= cache_x.shape[2]
55
+ x = F.pad(x, padding)
56
+
57
+ return super().forward(x)
58
+
59
+
60
+ class RMS_norm(nn.Module):
61
+ def __init__(self, dim, channel_first=True, images=True, bias=False):
62
+ super().__init__()
63
+ broadcastable_dims = (1, 1, 1) if not images else (1, 1)
64
+ shape = (dim, *broadcastable_dims) if channel_first else (dim,)
65
+
66
+ self.channel_first = channel_first
67
+ self.scale = dim**0.5
68
+ self.gamma = nn.Parameter(torch.ones(shape))
69
+ self.bias = nn.Parameter(torch.zeros(shape)) if bias else 0.0
70
+
71
+ def forward(self, x):
72
+ return (
73
+ F.normalize(x, dim=(1 if self.channel_first else -1))
74
+ * self.scale
75
+ * self.gamma
76
+ + self.bias
77
+ )
78
+
79
+
80
+ class Upsample(nn.Upsample):
81
+ def forward(self, x):
82
+ """
83
+ Fix bfloat16 support for nearest neighbor interpolation.
84
+ """
85
+ return super().forward(x.float()).type_as(x)
86
+
87
+
88
+ class Resample(nn.Module):
89
+ def __init__(self, dim, mode):
90
+ assert mode in (
91
+ "none",
92
+ "upsample2d",
93
+ "upsample3d",
94
+ "downsample2d",
95
+ "downsample3d",
96
+ )
97
+ super().__init__()
98
+ self.dim = dim
99
+ self.mode = mode
100
+
101
+ # layers
102
+ if mode == "upsample2d":
103
+ self.resample = nn.Sequential(
104
+ Upsample(scale_factor=(2.0, 2.0), mode="nearest-exact"),
105
+ nn.Conv2d(dim, dim // 2, 3, padding=1),
106
+ )
107
+ elif mode == "upsample3d":
108
+ self.resample = nn.Sequential(
109
+ Upsample(scale_factor=(2.0, 2.0), mode="nearest-exact"),
110
+ nn.Conv2d(dim, dim // 2, 3, padding=1),
111
+ )
112
+ self.time_conv = CausalConv3d(dim, dim * 2, (3, 1, 1), padding=(1, 0, 0))
113
+
114
+ elif mode == "downsample2d":
115
+ self.resample = nn.Sequential(
116
+ nn.ZeroPad2d((0, 1, 0, 1)), nn.Conv2d(dim, dim, 3, stride=(2, 2))
117
+ )
118
+ elif mode == "downsample3d":
119
+ self.resample = nn.Sequential(
120
+ nn.ZeroPad2d((0, 1, 0, 1)), nn.Conv2d(dim, dim, 3, stride=(2, 2))
121
+ )
122
+ self.time_conv = CausalConv3d(
123
+ dim, dim, (3, 1, 1), stride=(2, 1, 1), padding=(0, 0, 0)
124
+ )
125
+
126
+ else:
127
+ self.resample = nn.Identity()
128
+
129
+ def forward(self, x, feat_cache=None, feat_idx=[0]):
130
+ b, c, t, h, w = x.size()
131
+ if self.mode == "upsample3d":
132
+ if feat_cache is not None:
133
+ idx = feat_idx[0]
134
+ if feat_cache[idx] is None:
135
+ feat_cache[idx] = "Rep"
136
+ feat_idx[0] += 1
137
+ else:
138
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
139
+ if (
140
+ cache_x.shape[2] < 2
141
+ and feat_cache[idx] is not None
142
+ and feat_cache[idx] != "Rep"
143
+ ):
144
+ # cache last frame of last two chunk
145
+ cache_x = torch.cat(
146
+ [
147
+ feat_cache[idx][:, :, -1, :, :]
148
+ .unsqueeze(2)
149
+ .to(cache_x.device),
150
+ cache_x,
151
+ ],
152
+ dim=2,
153
+ )
154
+ if (
155
+ cache_x.shape[2] < 2
156
+ and feat_cache[idx] is not None
157
+ and feat_cache[idx] == "Rep"
158
+ ):
159
+ cache_x = torch.cat(
160
+ [torch.zeros_like(cache_x).to(cache_x.device), cache_x],
161
+ dim=2,
162
+ )
163
+ if feat_cache[idx] == "Rep":
164
+ x = self.time_conv(x)
165
+ else:
166
+ x = self.time_conv(x, feat_cache[idx])
167
+ feat_cache[idx] = cache_x
168
+ feat_idx[0] += 1
169
+
170
+ x = x.reshape(b, 2, c, t, h, w)
171
+ x = torch.stack((x[:, 0, :, :, :, :], x[:, 1, :, :, :, :]), 3)
172
+ x = x.reshape(b, c, t * 2, h, w)
173
+ t = x.shape[2]
174
+ x = rearrange(x, "b c t h w -> (b t) c h w")
175
+ x = self.resample(x)
176
+ x = rearrange(x, "(b t) c h w -> b c t h w", t=t)
177
+
178
+ if self.mode == "downsample3d":
179
+ if feat_cache is not None:
180
+ idx = feat_idx[0]
181
+ if feat_cache[idx] is None:
182
+ feat_cache[idx] = x.clone()
183
+ feat_idx[0] += 1
184
+ else:
185
+ cache_x = x[:, :, -1:, :, :].clone()
186
+ x = self.time_conv(
187
+ torch.cat([feat_cache[idx][:, :, -1:, :, :], x], 2)
188
+ )
189
+ feat_cache[idx] = cache_x
190
+ feat_idx[0] += 1
191
+ return x
192
+
193
+ def init_weight(self, conv):
194
+ conv_weight = conv.weight
195
+ nn.init.zeros_(conv_weight)
196
+ c1, c2, t, h, w = conv_weight.size()
197
+ one_matrix = torch.eye(c1, c2)
198
+ init_matrix = one_matrix
199
+ nn.init.zeros_(conv_weight)
200
+ conv_weight.data[:, :, 1, 0, 0] = init_matrix
201
+ conv.weight.data.copy_(conv_weight)
202
+ nn.init.zeros_(conv.bias.data)
203
+
204
+ def init_weight2(self, conv):
205
+ conv_weight = conv.weight.data
206
+ nn.init.zeros_(conv_weight)
207
+ c1, c2, t, h, w = conv_weight.size()
208
+ init_matrix = torch.eye(c1 // 2, c2)
209
+ conv_weight[: c1 // 2, :, -1, 0, 0] = init_matrix
210
+ conv_weight[c1 // 2 :, :, -1, 0, 0] = init_matrix
211
+ conv.weight.data.copy_(conv_weight)
212
+ nn.init.zeros_(conv.bias.data)
213
+
214
+
215
+ def patchify(x, patch_size):
216
+ if patch_size == 1:
217
+ return x
218
+ if x.dim() == 4:
219
+ x = rearrange(x, "b c (h q) (w r) -> b (c r q) h w", q=patch_size, r=patch_size)
220
+ elif x.dim() == 5:
221
+ x = rearrange(
222
+ x, "b c f (h q) (w r) -> b (c r q) f h w", q=patch_size, r=patch_size
223
+ )
224
+ else:
225
+ raise ValueError(f"Invalid input shape: {x.shape}")
226
+ return x
227
+
228
+
229
+ def unpatchify(x, patch_size):
230
+ if patch_size == 1:
231
+ return x
232
+ if x.dim() == 4:
233
+ x = rearrange(x, "b (c r q) h w -> b c (h q) (w r)", q=patch_size, r=patch_size)
234
+ elif x.dim() == 5:
235
+ x = rearrange(
236
+ x, "b (c r q) f h w -> b c f (h q) (w r)", q=patch_size, r=patch_size
237
+ )
238
+ return x
239
+
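+
A small hypothetical round-trip check of the two helpers above (not part of the commit; the import path assumes the repo's `models` package layout): patchify folds each 2x2 spatial patch into the channel axis (3 channels become 12, matching the 12-channel conv1 of Encoder3d_38 below), and unpatchify inverts it exactly.

import torch
from models.wan_video_vae import patchify, unpatchify

x = torch.rand(1, 3, 4, 64, 64)      # [B, C, T, H, W]
p = patchify(x, patch_size=2)        # -> [1, 12, 4, 32, 32]
assert torch.equal(unpatchify(p, 2), x)
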
240
+
241
+ class Resample38(Resample):
242
+ def __init__(self, dim, mode):
243
+ assert mode in (
244
+ "none",
245
+ "upsample2d",
246
+ "upsample3d",
247
+ "downsample2d",
248
+ "downsample3d",
249
+ )
250
+ super(Resample, self).__init__()
251
+ self.dim = dim
252
+ self.mode = mode
253
+
254
+ # layers
255
+ if mode == "upsample2d":
256
+ self.resample = nn.Sequential(
257
+ Upsample(scale_factor=(2.0, 2.0), mode="nearest-exact"),
258
+ nn.Conv2d(dim, dim, 3, padding=1),
259
+ )
260
+ elif mode == "upsample3d":
261
+ self.resample = nn.Sequential(
262
+ Upsample(scale_factor=(2.0, 2.0), mode="nearest-exact"),
263
+ nn.Conv2d(dim, dim, 3, padding=1),
264
+ )
265
+ self.time_conv = CausalConv3d(dim, dim * 2, (3, 1, 1), padding=(1, 0, 0))
266
+ elif mode == "downsample2d":
267
+ self.resample = nn.Sequential(
268
+ nn.ZeroPad2d((0, 1, 0, 1)), nn.Conv2d(dim, dim, 3, stride=(2, 2))
269
+ )
270
+ elif mode == "downsample3d":
271
+ self.resample = nn.Sequential(
272
+ nn.ZeroPad2d((0, 1, 0, 1)), nn.Conv2d(dim, dim, 3, stride=(2, 2))
273
+ )
274
+ self.time_conv = CausalConv3d(
275
+ dim, dim, (3, 1, 1), stride=(2, 1, 1), padding=(0, 0, 0)
276
+ )
277
+ else:
278
+ self.resample = nn.Identity()
279
+
280
+
281
+ class ResidualBlock(nn.Module):
282
+ def __init__(self, in_dim, out_dim, dropout=0.0):
283
+ super().__init__()
284
+ self.in_dim = in_dim
285
+ self.out_dim = out_dim
286
+
287
+ # layers
288
+ self.residual = nn.Sequential(
289
+ RMS_norm(in_dim, images=False),
290
+ nn.SiLU(),
291
+ CausalConv3d(in_dim, out_dim, 3, padding=1),
292
+ RMS_norm(out_dim, images=False),
293
+ nn.SiLU(),
294
+ nn.Dropout(dropout),
295
+ CausalConv3d(out_dim, out_dim, 3, padding=1),
296
+ )
297
+ self.shortcut = (
298
+ CausalConv3d(in_dim, out_dim, 1) if in_dim != out_dim else nn.Identity()
299
+ )
300
+
301
+ def forward(self, x, feat_cache=None, feat_idx=[0]):
302
+ h = self.shortcut(x)
303
+ for layer in self.residual:
304
+ if check_is_instance(layer, CausalConv3d) and feat_cache is not None:
305
+ idx = feat_idx[0]
306
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
307
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
308
+ # cache last frame of last two chunk
309
+ cache_x = torch.cat(
310
+ [
311
+ feat_cache[idx][:, :, -1, :, :]
312
+ .unsqueeze(2)
313
+ .to(cache_x.device),
314
+ cache_x,
315
+ ],
316
+ dim=2,
317
+ )
318
+ x = layer(x, feat_cache[idx])
319
+ feat_cache[idx] = cache_x
320
+ feat_idx[0] += 1
321
+ else:
322
+ x = layer(x)
323
+ return x + h
324
+
325
+
326
+ class AttentionBlock(nn.Module):
327
+ """
328
+ Causal self-attention with a single head.
329
+ """
330
+
331
+ def __init__(self, dim):
332
+ super().__init__()
333
+ self.dim = dim
334
+
335
+ # layers
336
+ self.norm = RMS_norm(dim)
337
+ self.to_qkv = nn.Conv2d(dim, dim * 3, 1)
338
+ self.proj = nn.Conv2d(dim, dim, 1)
339
+
340
+ # zero out the last layer params
341
+ nn.init.zeros_(self.proj.weight)
342
+
343
+ def forward(self, x):
344
+ identity = x
345
+ b, c, t, h, w = x.size()
346
+ x = rearrange(x, "b c t h w -> (b t) c h w")
347
+ x = self.norm(x)
348
+ # compute query, key, value
349
+ q, k, v = (
350
+ self.to_qkv(x)
351
+ .reshape(b * t, 1, c * 3, -1)
352
+ .permute(0, 1, 3, 2)
353
+ .contiguous()
354
+ .chunk(3, dim=-1)
355
+ )
356
+
357
+ # apply attention
358
+ x = F.scaled_dot_product_attention(
359
+ q,
360
+ k,
361
+ v,
362
+ # attn_mask=block_causal_mask(q, block_size=h * w)
363
+ )
364
+ x = x.squeeze(1).permute(0, 2, 1).reshape(b * t, c, h, w)
365
+
366
+ # output
367
+ x = self.proj(x)
368
+ x = rearrange(x, "(b t) c h w-> b c t h w", t=t)
369
+ return x + identity
370
+
371
+
372
+ class AvgDown3D(nn.Module):
373
+ def __init__(
374
+ self,
375
+ in_channels,
376
+ out_channels,
377
+ factor_t,
378
+ factor_s=1,
379
+ ):
380
+ super().__init__()
381
+ self.in_channels = in_channels
382
+ self.out_channels = out_channels
383
+ self.factor_t = factor_t
384
+ self.factor_s = factor_s
385
+ self.factor = self.factor_t * self.factor_s * self.factor_s
386
+
387
+ assert in_channels * self.factor % out_channels == 0
388
+ self.group_size = in_channels * self.factor // out_channels
389
+
390
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
391
+ pad_t = (self.factor_t - x.shape[2] % self.factor_t) % self.factor_t
392
+ pad = (0, 0, 0, 0, pad_t, 0)
393
+ x = F.pad(x, pad)
394
+ B, C, T, H, W = x.shape
395
+ x = x.view(
396
+ B,
397
+ C,
398
+ T // self.factor_t,
399
+ self.factor_t,
400
+ H // self.factor_s,
401
+ self.factor_s,
402
+ W // self.factor_s,
403
+ self.factor_s,
404
+ )
405
+ x = x.permute(0, 1, 3, 5, 7, 2, 4, 6).contiguous()
406
+ x = x.view(
407
+ B,
408
+ C * self.factor,
409
+ T // self.factor_t,
410
+ H // self.factor_s,
411
+ W // self.factor_s,
412
+ )
413
+ x = x.view(
414
+ B,
415
+ self.out_channels,
416
+ self.group_size,
417
+ T // self.factor_t,
418
+ H // self.factor_s,
419
+ W // self.factor_s,
420
+ )
421
+ x = x.mean(dim=2)
422
+ return x
423
+
424
+
425
+ class DupUp3D(nn.Module):
426
+ def __init__(
427
+ self,
428
+ in_channels: int,
429
+ out_channels: int,
430
+ factor_t,
431
+ factor_s=1,
432
+ ):
433
+ super().__init__()
434
+ self.in_channels = in_channels
435
+ self.out_channels = out_channels
436
+
437
+ self.factor_t = factor_t
438
+ self.factor_s = factor_s
439
+ self.factor = self.factor_t * self.factor_s * self.factor_s
440
+
441
+ assert out_channels * self.factor % in_channels == 0
442
+ self.repeats = out_channels * self.factor // in_channels
443
+
444
+ def forward(self, x: torch.Tensor, first_chunk=False) -> torch.Tensor:
445
+ x = x.repeat_interleave(self.repeats, dim=1)
446
+ x = x.view(
447
+ x.size(0),
448
+ self.out_channels,
449
+ self.factor_t,
450
+ self.factor_s,
451
+ self.factor_s,
452
+ x.size(2),
453
+ x.size(3),
454
+ x.size(4),
455
+ )
456
+ x = x.permute(0, 1, 5, 2, 6, 3, 7, 4).contiguous()
457
+ x = x.view(
458
+ x.size(0),
459
+ self.out_channels,
460
+ x.size(2) * self.factor_t,
461
+ x.size(4) * self.factor_s,
462
+ x.size(6) * self.factor_s,
463
+ )
464
+ if first_chunk:
465
+ x = x[:, :, self.factor_t - 1 :, :, :]
466
+ return x
467
+
468
+
469
+ class Down_ResidualBlock(nn.Module):
470
+ def __init__(
471
+ self, in_dim, out_dim, dropout, mult, temperal_downsample=False, down_flag=False
472
+ ):
473
+ super().__init__()
474
+
475
+ # Shortcut path with downsample
476
+ self.avg_shortcut = AvgDown3D(
477
+ in_dim,
478
+ out_dim,
479
+ factor_t=2 if temperal_downsample else 1,
480
+ factor_s=2 if down_flag else 1,
481
+ )
482
+
483
+ # Main path with residual blocks and downsample
484
+ downsamples = []
485
+ for _ in range(mult):
486
+ downsamples.append(ResidualBlock(in_dim, out_dim, dropout))
487
+ in_dim = out_dim
488
+
489
+ # Add the final downsample block
490
+ if down_flag:
491
+ mode = "downsample3d" if temperal_downsample else "downsample2d"
492
+ downsamples.append(Resample38(out_dim, mode=mode))
493
+
494
+ self.downsamples = nn.Sequential(*downsamples)
495
+
496
+ def forward(self, x, feat_cache=None, feat_idx=[0]):
497
+ x_copy = x.clone()
498
+ for module in self.downsamples:
499
+ x = module(x, feat_cache, feat_idx)
500
+
501
+ return x + self.avg_shortcut(x_copy)
502
+
503
+
504
+ class Up_ResidualBlock(nn.Module):
505
+ def __init__(
506
+ self, in_dim, out_dim, dropout, mult, temperal_upsample=False, up_flag=False
507
+ ):
508
+ super().__init__()
509
+ # Shortcut path with upsample
510
+ if up_flag:
511
+ self.avg_shortcut = DupUp3D(
512
+ in_dim,
513
+ out_dim,
514
+ factor_t=2 if temperal_upsample else 1,
515
+ factor_s=2 if up_flag else 1,
516
+ )
517
+ else:
518
+ self.avg_shortcut = None
519
+
520
+ # Main path with residual blocks and upsample
521
+ upsamples = []
522
+ for _ in range(mult):
523
+ upsamples.append(ResidualBlock(in_dim, out_dim, dropout))
524
+ in_dim = out_dim
525
+
526
+ # Add the final upsample block
527
+ if up_flag:
528
+ mode = "upsample3d" if temperal_upsample else "upsample2d"
529
+ upsamples.append(Resample38(out_dim, mode=mode))
530
+
531
+ self.upsamples = nn.Sequential(*upsamples)
532
+
533
+ def forward(self, x, feat_cache=None, feat_idx=[0], first_chunk=False):
534
+ x_main = x.clone()
535
+ for module in self.upsamples:
536
+ x_main = module(x_main, feat_cache, feat_idx)
537
+ if self.avg_shortcut is not None:
538
+ x_shortcut = self.avg_shortcut(x, first_chunk)
539
+ return x_main + x_shortcut
540
+ else:
541
+ return x_main
542
+
543
+
544
+ class Encoder3d(nn.Module):
545
+ def __init__(
546
+ self,
547
+ dim=128,
548
+ z_dim=4,
549
+ dim_mult=[1, 2, 4, 4],
550
+ num_res_blocks=2,
551
+ attn_scales=[],
552
+ temperal_downsample=[True, True, False],
553
+ dropout=0.0,
554
+ ):
555
+ super().__init__()
556
+ self.dim = dim
557
+ self.z_dim = z_dim
558
+ self.dim_mult = dim_mult
559
+ self.num_res_blocks = num_res_blocks
560
+ self.attn_scales = attn_scales
561
+ self.temperal_downsample = temperal_downsample
562
+
563
+ # dimensions
564
+ dims = [dim * u for u in [1] + dim_mult]
565
+ scale = 1.0
566
+
567
+ # init block
568
+ self.conv1 = CausalConv3d(3, dims[0], 3, padding=1)
569
+
570
+ # downsample blocks
571
+ downsamples = []
572
+ for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
573
+ # residual (+attention) blocks
574
+ for _ in range(num_res_blocks):
575
+ downsamples.append(ResidualBlock(in_dim, out_dim, dropout))
576
+ if scale in attn_scales:
577
+ downsamples.append(AttentionBlock(out_dim))
578
+ in_dim = out_dim
579
+
580
+ # downsample block
581
+ if i != len(dim_mult) - 1:
582
+ mode = "downsample3d" if temperal_downsample[i] else "downsample2d"
583
+ downsamples.append(Resample(out_dim, mode=mode))
584
+ scale /= 2.0
585
+ self.downsamples = nn.Sequential(*downsamples)
586
+
587
+ # middle blocks
588
+ self.middle = nn.Sequential(
589
+ ResidualBlock(out_dim, out_dim, dropout),
590
+ AttentionBlock(out_dim),
591
+ ResidualBlock(out_dim, out_dim, dropout),
592
+ )
593
+
594
+ # output blocks
595
+ self.head = nn.Sequential(
596
+ RMS_norm(out_dim, images=False),
597
+ nn.SiLU(),
598
+ CausalConv3d(out_dim, z_dim, 3, padding=1),
599
+ )
600
+
601
+ def forward(self, x, feat_cache=None, feat_idx=[0]):
602
+ if feat_cache is not None:
603
+ idx = feat_idx[0]
604
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
605
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
606
+ # cache last frame of last two chunk
607
+ cache_x = torch.cat(
608
+ [
609
+ feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device),
610
+ cache_x,
611
+ ],
612
+ dim=2,
613
+ )
614
+ x = self.conv1(x, feat_cache[idx])
615
+ feat_cache[idx] = cache_x
616
+ feat_idx[0] += 1
617
+ else:
618
+ x = self.conv1(x)
619
+
620
+ ## downsamples
621
+ for layer in self.downsamples:
622
+ if feat_cache is not None:
623
+ x = layer(x, feat_cache, feat_idx)
624
+ else:
625
+ x = layer(x)
626
+
627
+ ## middle
628
+ for layer in self.middle:
629
+ if check_is_instance(layer, ResidualBlock) and feat_cache is not None:
630
+ x = layer(x, feat_cache, feat_idx)
631
+ else:
632
+ x = layer(x)
633
+
634
+ ## head
635
+ for layer in self.head:
636
+ if check_is_instance(layer, CausalConv3d) and feat_cache is not None:
637
+ idx = feat_idx[0]
638
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
639
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
640
+ # cache last frame of last two chunk
641
+ cache_x = torch.cat(
642
+ [
643
+ feat_cache[idx][:, :, -1, :, :]
644
+ .unsqueeze(2)
645
+ .to(cache_x.device),
646
+ cache_x,
647
+ ],
648
+ dim=2,
649
+ )
650
+ x = layer(x, feat_cache[idx])
651
+ feat_cache[idx] = cache_x
652
+ feat_idx[0] += 1
653
+ else:
654
+ x = layer(x)
655
+ return x
656
+
657
+
658
+ class Encoder3d_38(nn.Module):
659
+ def __init__(
660
+ self,
661
+ dim=128,
662
+ z_dim=4,
663
+ dim_mult=[1, 2, 4, 4],
664
+ num_res_blocks=2,
665
+ attn_scales=[],
666
+ temperal_downsample=[False, True, True],
667
+ dropout=0.0,
668
+ ):
669
+ super().__init__()
670
+ self.dim = dim
671
+ self.z_dim = z_dim
672
+ self.dim_mult = dim_mult
673
+ self.num_res_blocks = num_res_blocks
674
+ self.attn_scales = attn_scales
675
+ self.temperal_downsample = temperal_downsample
676
+
677
+ # dimensions
678
+ dims = [dim * u for u in [1] + dim_mult]
679
+ scale = 1.0
680
+
681
+ # init block
682
+ self.conv1 = CausalConv3d(12, dims[0], 3, padding=1)
683
+
684
+ # downsample blocks
685
+ downsamples = []
686
+ for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
687
+ t_down_flag = (
688
+ temperal_downsample[i] if i < len(temperal_downsample) else False
689
+ )
690
+ downsamples.append(
691
+ Down_ResidualBlock(
692
+ in_dim=in_dim,
693
+ out_dim=out_dim,
694
+ dropout=dropout,
695
+ mult=num_res_blocks,
696
+ temperal_downsample=t_down_flag,
697
+ down_flag=i != len(dim_mult) - 1,
698
+ )
699
+ )
700
+ scale /= 2.0
701
+ self.downsamples = nn.Sequential(*downsamples)
702
+
703
+ # middle blocks
704
+ self.middle = nn.Sequential(
705
+ ResidualBlock(out_dim, out_dim, dropout),
706
+ AttentionBlock(out_dim),
707
+ ResidualBlock(out_dim, out_dim, dropout),
708
+ )
709
+
710
+ # output blocks
711
+ self.head = nn.Sequential(
712
+ RMS_norm(out_dim, images=False),
713
+ nn.SiLU(),
714
+ CausalConv3d(out_dim, z_dim, 3, padding=1),
715
+ )
716
+
717
+ def forward(self, x, feat_cache=None, feat_idx=[0]):
718
+ if feat_cache is not None:
719
+ idx = feat_idx[0]
720
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
721
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
722
+ cache_x = torch.cat(
723
+ [
724
+ feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device),
725
+ cache_x,
726
+ ],
727
+ dim=2,
728
+ )
729
+ x = self.conv1(x, feat_cache[idx])
730
+ feat_cache[idx] = cache_x
731
+ feat_idx[0] += 1
732
+ else:
733
+ x = self.conv1(x)
734
+
735
+ ## downsamples
736
+ for layer in self.downsamples:
737
+ if feat_cache is not None:
738
+ x = layer(x, feat_cache, feat_idx)
739
+ else:
740
+ x = layer(x)
741
+
742
+ ## middle
743
+ for layer in self.middle:
744
+ if isinstance(layer, ResidualBlock) and feat_cache is not None:
745
+ x = layer(x, feat_cache, feat_idx)
746
+ else:
747
+ x = layer(x)
748
+
749
+ ## head
750
+ for layer in self.head:
751
+ if isinstance(layer, CausalConv3d) and feat_cache is not None:
752
+ idx = feat_idx[0]
753
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
754
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
755
+ cache_x = torch.cat(
756
+ [
757
+ feat_cache[idx][:, :, -1, :, :]
758
+ .unsqueeze(2)
759
+ .to(cache_x.device),
760
+ cache_x,
761
+ ],
762
+ dim=2,
763
+ )
764
+ x = layer(x, feat_cache[idx])
765
+ feat_cache[idx] = cache_x
766
+ feat_idx[0] += 1
767
+ else:
768
+ x = layer(x)
769
+
770
+ return x
771
+
772
+
773
+ class Decoder3d(nn.Module):
774
+ def __init__(
775
+ self,
776
+ dim=128,
777
+ z_dim=4,
778
+ dim_mult=[1, 2, 4, 4],
779
+ num_res_blocks=2,
780
+ attn_scales=[],
781
+ temperal_upsample=[False, True, True],
782
+ dropout=0.0,
783
+ ):
784
+ super().__init__()
785
+ self.dim = dim
786
+ self.z_dim = z_dim
787
+ self.dim_mult = dim_mult
788
+ self.num_res_blocks = num_res_blocks
789
+ self.attn_scales = attn_scales
790
+ self.temperal_upsample = temperal_upsample
791
+
792
+ # dimensions
793
+ dims = [dim * u for u in [dim_mult[-1]] + dim_mult[::-1]]
794
+ scale = 1.0 / 2 ** (len(dim_mult) - 2)
795
+
796
+ # init block
797
+ self.conv1 = CausalConv3d(z_dim, dims[0], 3, padding=1)
798
+
799
+ # middle blocks
800
+ self.middle = nn.Sequential(
801
+ ResidualBlock(dims[0], dims[0], dropout),
802
+ AttentionBlock(dims[0]),
803
+ ResidualBlock(dims[0], dims[0], dropout),
804
+ )
805
+
806
+ # upsample blocks
807
+ upsamples = []
808
+ for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
809
+ # residual (+attention) blocks
810
+ if i == 1 or i == 2 or i == 3:
811
+ in_dim = in_dim // 2
812
+ for _ in range(num_res_blocks + 1):
813
+ upsamples.append(ResidualBlock(in_dim, out_dim, dropout))
814
+ if scale in attn_scales:
815
+ upsamples.append(AttentionBlock(out_dim))
816
+ in_dim = out_dim
817
+
818
+ # upsample block
819
+ if i != len(dim_mult) - 1:
820
+ mode = "upsample3d" if temperal_upsample[i] else "upsample2d"
821
+ upsamples.append(Resample(out_dim, mode=mode))
822
+ scale *= 2.0
823
+ self.upsamples = nn.Sequential(*upsamples)
824
+
825
+ # output blocks
826
+ self.head = nn.Sequential(
827
+ RMS_norm(out_dim, images=False),
828
+ nn.SiLU(),
829
+ CausalConv3d(out_dim, 3, 3, padding=1),
830
+ )
831
+
832
+ def forward(self, x, feat_cache=None, feat_idx=[0]):
833
+ ## conv1
834
+ if feat_cache is not None:
835
+ idx = feat_idx[0]
836
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
837
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
838
+ # cache last frame of last two chunk
839
+ cache_x = torch.cat(
840
+ [
841
+ feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device),
842
+ cache_x,
843
+ ],
844
+ dim=2,
845
+ )
846
+ x = self.conv1(x, feat_cache[idx])
847
+ feat_cache[idx] = cache_x
848
+ feat_idx[0] += 1
849
+ else:
850
+ x = self.conv1(x)
851
+
852
+ ## middle
853
+ for layer in self.middle:
854
+ if check_is_instance(layer, ResidualBlock) and feat_cache is not None:
855
+ x = layer(x, feat_cache, feat_idx)
856
+ else:
857
+ x = layer(x)
858
+
859
+ ## upsamples
860
+ for layer in self.upsamples:
861
+ if feat_cache is not None:
862
+ x = layer(x, feat_cache, feat_idx)
863
+ else:
864
+ x = layer(x)
865
+
866
+ ## head
867
+ for layer in self.head:
868
+ if check_is_instance(layer, CausalConv3d) and feat_cache is not None:
869
+ idx = feat_idx[0]
870
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
871
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
872
+ # cache last frame of last two chunk
873
+ cache_x = torch.cat(
874
+ [
875
+ feat_cache[idx][:, :, -1, :, :]
876
+ .unsqueeze(2)
877
+ .to(cache_x.device),
878
+ cache_x,
879
+ ],
880
+ dim=2,
881
+ )
882
+ x = layer(x, feat_cache[idx])
883
+ feat_cache[idx] = cache_x
884
+ feat_idx[0] += 1
885
+ else:
886
+ x = layer(x)
887
+ return x
888
+
889
+
890
+ class Decoder3d_38(nn.Module):
891
+ def __init__(
892
+ self,
893
+ dim=128,
894
+ z_dim=4,
895
+ dim_mult=[1, 2, 4, 4],
896
+ num_res_blocks=2,
897
+ attn_scales=[],
898
+ temperal_upsample=[False, True, True],
899
+ dropout=0.0,
900
+ ):
901
+ super().__init__()
902
+ self.dim = dim
903
+ self.z_dim = z_dim
904
+ self.dim_mult = dim_mult
905
+ self.num_res_blocks = num_res_blocks
906
+ self.attn_scales = attn_scales
907
+ self.temperal_upsample = temperal_upsample
908
+
909
+ # dimensions
910
+ dims = [dim * u for u in [dim_mult[-1]] + dim_mult[::-1]]
911
+ scale = 1.0 / 2 ** (len(dim_mult) - 2)
912
+ # init block
913
+ self.conv1 = CausalConv3d(z_dim, dims[0], 3, padding=1)
914
+
915
+ # middle blocks
916
+ self.middle = nn.Sequential(
917
+ ResidualBlock(dims[0], dims[0], dropout),
918
+ AttentionBlock(dims[0]),
919
+ ResidualBlock(dims[0], dims[0], dropout),
920
+ )
921
+
922
+ # upsample blocks
923
+ upsamples = []
924
+ for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
925
+ t_up_flag = temperal_upsample[i] if i < len(temperal_upsample) else False
926
+ upsamples.append(
927
+ Up_ResidualBlock(
928
+ in_dim=in_dim,
929
+ out_dim=out_dim,
930
+ dropout=dropout,
931
+ mult=num_res_blocks + 1,
932
+ temperal_upsample=t_up_flag,
933
+ up_flag=i != len(dim_mult) - 1,
934
+ )
935
+ )
936
+ self.upsamples = nn.Sequential(*upsamples)
937
+
938
+ # output blocks
939
+ self.head = nn.Sequential(
940
+ RMS_norm(out_dim, images=False),
941
+ nn.SiLU(),
942
+ CausalConv3d(out_dim, 12, 3, padding=1),
943
+ )
944
+
945
+ def forward(self, x, feat_cache=None, feat_idx=[0], first_chunk=False):
946
+ if feat_cache is not None:
947
+ idx = feat_idx[0]
948
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
949
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
950
+ cache_x = torch.cat(
951
+ [
952
+ feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device),
953
+ cache_x,
954
+ ],
955
+ dim=2,
956
+ )
957
+ x = self.conv1(x, feat_cache[idx])
958
+ feat_cache[idx] = cache_x
959
+ feat_idx[0] += 1
960
+ else:
961
+ x = self.conv1(x)
962
+
963
+ for layer in self.middle:
964
+ if check_is_instance(layer, ResidualBlock) and feat_cache is not None:
965
+ x = layer(x, feat_cache, feat_idx)
966
+ else:
967
+ x = layer(x)
968
+
969
+ ## upsamples
970
+ for layer in self.upsamples:
971
+ if feat_cache is not None:
972
+ x = layer(x, feat_cache, feat_idx, first_chunk)
973
+ else:
974
+ x = layer(x)
975
+
976
+ ## head
977
+ for layer in self.head:
978
+ if check_is_instance(layer, CausalConv3d) and feat_cache is not None:
979
+ idx = feat_idx[0]
980
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
981
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
982
+ cache_x = torch.cat(
983
+ [
984
+ feat_cache[idx][:, :, -1, :, :]
985
+ .unsqueeze(2)
986
+ .to(cache_x.device),
987
+ cache_x,
988
+ ],
989
+ dim=2,
990
+ )
991
+ x = layer(x, feat_cache[idx])
992
+ feat_cache[idx] = cache_x
993
+ feat_idx[0] += 1
994
+ else:
995
+ x = layer(x)
996
+ return x
997
+
998
+
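+ # Counts CausalConv3d modules so clear_cache() can allocate one feature-cache slot per conv.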
999
+ def count_conv3d(model):
1000
+ count = 0
1001
+ for m in model.modules():
1002
+ if isinstance(m, CausalConv3d):
1003
+ count += 1
1004
+ return count
1005
+
1006
+
1007
+ class VideoVAE_(nn.Module):
1008
+ def __init__(
1009
+ self,
1010
+ dim=96,
1011
+ z_dim=16,
1012
+ dim_mult=[1, 2, 4, 4],
1013
+ num_res_blocks=2,
1014
+ attn_scales=[],
1015
+ temperal_downsample=[False, True, True],
1016
+ dropout=0.0,
1017
+ ):
1018
+ super().__init__()
1019
+ self.dim = dim
1020
+ self.z_dim = z_dim
1021
+ self.dim_mult = dim_mult
1022
+ self.num_res_blocks = num_res_blocks
1023
+ self.attn_scales = attn_scales
1024
+ self.temperal_downsample = temperal_downsample
1025
+ self.temperal_upsample = temperal_downsample[::-1]
1026
+
1027
+ # modules
1028
+ self.encoder = Encoder3d(
1029
+ dim,
1030
+ z_dim * 2,
1031
+ dim_mult,
1032
+ num_res_blocks,
1033
+ attn_scales,
1034
+ self.temperal_downsample,
1035
+ dropout,
1036
+ )
1037
+ self.conv1 = CausalConv3d(z_dim * 2, z_dim * 2, 1)
1038
+ self.conv2 = CausalConv3d(z_dim, z_dim, 1)
1039
+ self.decoder = Decoder3d(
1040
+ dim,
1041
+ z_dim,
1042
+ dim_mult,
1043
+ num_res_blocks,
1044
+ attn_scales,
1045
+ self.temperal_upsample,
1046
+ dropout,
1047
+ )
1048
+
1049
+ def forward(self, x):
1050
+ mu, log_var = self.encode(x)
1051
+ z = self.reparameterize(mu, log_var)
1052
+ x_recon = self.decode(z)
1053
+ return x_recon, mu, log_var
1054
+
1055
+ def encode(self, x, scale):
1056
+ self.clear_cache()
1057
+ ## cache
1058
+ t = x.shape[2]
1059
+ iter_ = 1 + (t - 1) // 4
1060
+
1061
+ for i in range(iter_):
1062
+ self._enc_conv_idx = [0]
1063
+ if i == 0:
1064
+ out = self.encoder(
1065
+ x[:, :, :1, :, :],
1066
+ feat_cache=self._enc_feat_map,
1067
+ feat_idx=self._enc_conv_idx,
1068
+ )
1069
+ else:
1070
+ out_ = self.encoder(
1071
+ x[:, :, 1 + 4 * (i - 1) : 1 + 4 * i, :, :],
1072
+ feat_cache=self._enc_feat_map,
1073
+ feat_idx=self._enc_conv_idx,
1074
+ )
1075
+ out = torch.cat([out, out_], 2)
1076
+ mu, log_var = self.conv1(out).chunk(2, dim=1)
1077
+ if isinstance(scale[0], torch.Tensor):
1078
+ scale = [s.to(dtype=mu.dtype, device=mu.device) for s in scale]
1079
+ mu = (mu - scale[0].view(1, self.z_dim, 1, 1, 1)) * scale[1].view(
1080
+ 1, self.z_dim, 1, 1, 1
1081
+ )
1082
+ else:
1083
+ scale = scale.to(dtype=mu.dtype, device=mu.device)
1084
+ mu = (mu - scale[0]) * scale[1]
1085
+ return mu
1086
+
1087
+ def decode(self, z, scale):
1088
+ self.clear_cache()
1089
+ # z: [b,c,t,h,w]
1090
+ if isinstance(scale[0], torch.Tensor):
1091
+ scale = [s.to(dtype=z.dtype, device=z.device) for s in scale]
1092
+ z = z / scale[1].view(1, self.z_dim, 1, 1, 1) + scale[0].view(
1093
+ 1, self.z_dim, 1, 1, 1
1094
+ )
1095
+ else:
1096
+ scale = scale.to(dtype=z.dtype, device=z.device)
1097
+ z = z / scale[1] + scale[0]
1098
+ iter_ = z.shape[2]
1099
+ x = self.conv2(z)
1100
+ for i in range(iter_):
1101
+ self._conv_idx = [0]
1102
+ if i == 0:
1103
+ out = self.decoder(
1104
+ x[:, :, i : i + 1, :, :],
1105
+ feat_cache=self._feat_map,
1106
+ feat_idx=self._conv_idx,
1107
+ )
1108
+ else:
1109
+ out_ = self.decoder(
1110
+ x[:, :, i : i + 1, :, :],
1111
+ feat_cache=self._feat_map,
1112
+ feat_idx=self._conv_idx,
1113
+ )
1114
+ out = torch.cat([out, out_], 2)  # note: intermediate chunks could be offloaded to save VRAM
1115
+ return out
1116
+
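+ # Reparameterization trick: z = mu + sigma * eps with eps ~ N(0, I), keeping sampling differentiable.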
1117
+ def reparameterize(self, mu, log_var):
1118
+ std = torch.exp(0.5 * log_var)
1119
+ eps = torch.randn_like(std)
1120
+ return eps * std + mu
1121
+
1122
+ def sample(self, imgs, deterministic=False):
1123
+ mu, log_var = self.encode(imgs)
1124
+ if deterministic:
1125
+ return mu
1126
+ std = torch.exp(0.5 * log_var.clamp(-30.0, 20.0))
1127
+ return mu + std * torch.randn_like(std)
1128
+
1129
+ def clear_cache(self):
1130
+ self._conv_num = count_conv3d(self.decoder)
1131
+ self._conv_idx = [0]
1132
+ self._feat_map = [None] * self._conv_num
1133
+ # cache encode
1134
+ self._enc_conv_num = count_conv3d(self.encoder)
1135
+ self._enc_conv_idx = [0]
1136
+ self._enc_feat_map = [None] * self._enc_conv_num
1137
+
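+ # Temporal bookkeeping used by encode/decode above: T input frames map to
+ # 1 + (T - 1) // 4 latent frames (first frame alone, then chunks of 4), and
+ # T_lat latent frames decode back to 4 * T_lat - 3 output frames.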
1138
+
1139
+ class WanVideoVAE(nn.Module):
1140
+ def __init__(self, z_dim=16):
1141
+ super().__init__()
1142
+
1143
+ mean = [
1144
+ -0.7571,
1145
+ -0.7089,
1146
+ -0.9113,
1147
+ 0.1075,
1148
+ -0.1745,
1149
+ 0.9653,
1150
+ -0.1517,
1151
+ 1.5508,
1152
+ 0.4134,
1153
+ -0.0715,
1154
+ 0.5517,
1155
+ -0.3632,
1156
+ -0.1922,
1157
+ -0.9497,
1158
+ 0.2503,
1159
+ -0.2921,
1160
+ ]
1161
+ std = [
1162
+ 2.8184,
1163
+ 1.4541,
1164
+ 2.3275,
1165
+ 2.6558,
1166
+ 1.2196,
1167
+ 1.7708,
1168
+ 2.6052,
1169
+ 2.0743,
1170
+ 3.2687,
1171
+ 2.1526,
1172
+ 2.8652,
1173
+ 1.5579,
1174
+ 1.6382,
1175
+ 1.1253,
1176
+ 2.8251,
1177
+ 1.9160,
1178
+ ]
1179
+ self.mean = torch.tensor(mean)
1180
+ self.std = torch.tensor(std)
1181
+ self.scale = [self.mean, 1.0 / self.std]
1182
+
1183
+ # init model
1184
+ self.model = VideoVAE_(z_dim=z_dim).eval().requires_grad_(False)
1185
+ self.upsampling_factor = 8
1186
+ self.z_dim = z_dim
1187
+
1188
+ def build_1d_mask(self, length, left_bound, right_bound, border_width):
1189
+ x = torch.ones((length,))
1190
+ if not left_bound:
1191
+ x[:border_width] = (torch.arange(border_width) + 1) / border_width
1192
+ if not right_bound:
1193
+ x[-border_width:] = torch.flip(
1194
+ (torch.arange(border_width) + 1) / border_width, dims=(0,)
1195
+ )
1196
+ return x
1197
+
1198
+ def build_mask(self, data, is_bound, border_width):
1199
+ _, _, _, H, W = data.shape
1200
+ h = self.build_1d_mask(H, is_bound[0], is_bound[1], border_width[0])
1201
+ w = self.build_1d_mask(W, is_bound[2], is_bound[3], border_width[1])
1202
+
1203
+ h = repeat(h, "H -> H W", H=H, W=W)
1204
+ w = repeat(w, "W -> H W", H=H, W=W)
1205
+
1206
+ mask = torch.stack([h, w]).min(dim=0).values
1207
+ mask = rearrange(mask, "H W -> 1 1 1 H W")
1208
+ return mask
1209
+
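+ # The masks from build_mask ramp linearly from 0 to 1 over border_width pixels on every
+ # edge that is not an image boundary (combined via an element-wise minimum), so
+ # overlapping tiles cross-fade rather than leaving visible seams.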
1210
+ def tiled_decode(self, hidden_states, device, tile_size, tile_stride):
1211
+ _, _, T, H, W = hidden_states.shape
1212
+ size_h, size_w = tile_size
1213
+ stride_h, stride_w = tile_stride
1214
+
1215
+ # Split tasks
1216
+ tasks = []
1217
+ for h in range(0, H, stride_h):
1218
+ if h - stride_h >= 0 and h - stride_h + size_h >= H:
1219
+ continue
1220
+ for w in range(0, W, stride_w):
1221
+ if w - stride_w >= 0 and w - stride_w + size_w >= W:
1222
+ continue
1223
+ h_, w_ = h + size_h, w + size_w
1224
+ tasks.append((h, h_, w, w_))
1225
+
1226
+ data_device = "cpu"
1227
+ computation_device = device
1228
+
1229
+ out_T = T * 4 - 3
1230
+ weight = torch.zeros(
1231
+ (1, 1, out_T, H * self.upsampling_factor, W * self.upsampling_factor),
1232
+ dtype=hidden_states.dtype,
1233
+ device=data_device,
1234
+ )
1235
+ values = torch.zeros(
1236
+ (1, 3, out_T, H * self.upsampling_factor, W * self.upsampling_factor),
1237
+ dtype=hidden_states.dtype,
1238
+ device=data_device,
1239
+ )
1240
+
1241
+ for h, h_, w, w_ in tqdm(tasks, desc="VAE decoding"):
1242
+ hidden_states_batch = hidden_states[:, :, :, h:h_, w:w_].to(
1243
+ computation_device
1244
+ )
1245
+ hidden_states_batch = self.model.decode(hidden_states_batch, self.scale).to(
1246
+ data_device
1247
+ )
1248
+
1249
+ mask = self.build_mask(
1250
+ hidden_states_batch,
1251
+ is_bound=(h == 0, h_ >= H, w == 0, w_ >= W),
1252
+ border_width=(
1253
+ (size_h - stride_h) * self.upsampling_factor,
1254
+ (size_w - stride_w) * self.upsampling_factor,
1255
+ ),
1256
+ ).to(dtype=hidden_states.dtype, device=data_device)
1257
+
1258
+ target_h = h * self.upsampling_factor
1259
+ target_w = w * self.upsampling_factor
1260
+ values[
1261
+ :,
1262
+ :,
1263
+ :,
1264
+ target_h : target_h + hidden_states_batch.shape[3],
1265
+ target_w : target_w + hidden_states_batch.shape[4],
1266
+ ] += hidden_states_batch * mask
1267
+ weight[
1268
+ :,
1269
+ :,
1270
+ :,
1271
+ target_h : target_h + hidden_states_batch.shape[3],
1272
+ target_w : target_w + hidden_states_batch.shape[4],
1273
+ ] += mask
1274
+ values = values / weight
1275
+ values = values.clamp_(-1, 1)
1276
+ return values
1277
+
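+ # tiled_encode mirrors tiled_decode: each tile contributes value * mask and mask to the
+ # running sums, and dividing by the accumulated weight yields a weighted average per position.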
1278
+ def tiled_encode(self, video, device, tile_size, tile_stride):
1279
+ _, _, T, H, W = video.shape
1280
+ size_h, size_w = tile_size
1281
+ stride_h, stride_w = tile_stride
1282
+
1283
+ # Split tasks
1284
+ tasks = []
1285
+ for h in range(0, H, stride_h):
1286
+ if h - stride_h >= 0 and h - stride_h + size_h >= H:
1287
+ continue
1288
+ for w in range(0, W, stride_w):
1289
+ if w - stride_w >= 0 and w - stride_w + size_w >= W:
1290
+ continue
1291
+ h_, w_ = h + size_h, w + size_w
1292
+ tasks.append((h, h_, w, w_))
1293
+
1294
+ data_device = "cpu"
1295
+ computation_device = device
1296
+
1297
+ out_T = (T + 3) // 4
1298
+ weight = torch.zeros(
1299
+ (1, 1, out_T, H // self.upsampling_factor, W // self.upsampling_factor),
1300
+ dtype=video.dtype,
1301
+ device=data_device,
1302
+ )
1303
+ values = torch.zeros(
1304
+ (
1305
+ 1,
1306
+ self.z_dim,
1307
+ out_T,
1308
+ H // self.upsampling_factor,
1309
+ W // self.upsampling_factor,
1310
+ ),
1311
+ dtype=video.dtype,
1312
+ device=data_device,
1313
+ )
1314
+
1315
+ for h, h_, w, w_ in tqdm(tasks, desc="VAE encoding"):
1316
+ hidden_states_batch = video[:, :, :, h:h_, w:w_].to(computation_device)
1317
+ hidden_states_batch = self.model.encode(hidden_states_batch, self.scale).to(
1318
+ data_device
1319
+ )
1320
+
1321
+ mask = self.build_mask(
1322
+ hidden_states_batch,
1323
+ is_bound=(h == 0, h_ >= H, w == 0, w_ >= W),
1324
+ border_width=(
1325
+ (size_h - stride_h) // self.upsampling_factor,
1326
+ (size_w - stride_w) // self.upsampling_factor,
1327
+ ),
1328
+ ).to(dtype=video.dtype, device=data_device)
1329
+
1330
+ target_h = h // self.upsampling_factor
1331
+ target_w = w // self.upsampling_factor
1332
+ values[
1333
+ :,
1334
+ :,
1335
+ :,
1336
+ target_h : target_h + hidden_states_batch.shape[3],
1337
+ target_w : target_w + hidden_states_batch.shape[4],
1338
+ ] += hidden_states_batch * mask
1339
+ weight[
1340
+ :,
1341
+ :,
1342
+ :,
1343
+ target_h : target_h + hidden_states_batch.shape[3],
1344
+ target_w : target_w + hidden_states_batch.shape[4],
1345
+ ] += mask
1346
+ values = values / weight
1347
+ return values
1348
+
1349
+ def single_encode(self, video, device):
1350
+ video = video.to(device)
1351
+ x = self.model.encode(video, self.scale)
1352
+ return x
1353
+
1354
+ def single_decode(self, hidden_state, device):
1355
+ hidden_state = hidden_state.to(device)
1356
+ video = self.model.decode(hidden_state, self.scale)
1357
+ return video.clamp_(-1, 1)
1358
+
1359
+ def encode(
1360
+ self, videos, device, tiled=False, tile_size=(34, 34), tile_stride=(18, 16)
1361
+ ):
1362
+ videos = [video.to("cpu") for video in videos]
1363
+ hidden_states = []
1364
+ for video in videos:
1365
+ video = video.unsqueeze(0)
1366
+ if tiled:
1367
+ tile_size = (
1368
+ tile_size[0] * self.upsampling_factor,
1369
+ tile_size[1] * self.upsampling_factor,
1370
+ )
1371
+ tile_stride = (
1372
+ tile_stride[0] * self.upsampling_factor,
1373
+ tile_stride[1] * self.upsampling_factor,
1374
+ )
1375
+ hidden_state = self.tiled_encode(video, device, tile_size, tile_stride)
1376
+ else:
1377
+ hidden_state = self.single_encode(video, device)
1378
+ hidden_state = hidden_state.squeeze(0)
1379
+ hidden_states.append(hidden_state)
1380
+ hidden_states = torch.stack(hidden_states)
1381
+ return hidden_states
1382
+
1383
+ def decode(
1384
+ self,
1385
+ hidden_states,
1386
+ device,
1387
+ tiled=False,
1388
+ tile_size=(34, 34),
1389
+ tile_stride=(18, 16),
1390
+ ):
1391
+ if tiled:
1392
+ video = self.tiled_decode(hidden_states, device, tile_size, tile_stride)
1393
+ else:
1394
+ video = self.single_decode(hidden_states, device)
1395
+ return video
1396
+
1397
+ @staticmethod
1398
+ def state_dict_converter():
1399
+ return WanVideoVAEStateDictConverter()
1400
+
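+ # Usage sketch (assumes weights have already been loaded into self.model). Given video as a
+ # [3, T, H, W] tensor in [-1, 1]:
+ #   latents = vae.encode([video], device="cuda", tiled=True)   # [1, 16, 1+(T-1)//4, H//8, W//8]
+ #   frames  = vae.decode(latents, device="cuda", tiled=True)   # [1, 3, T', H, W] in [-1, 1]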
1401
+
1402
+ class WanVideoVAEStateDictConverter:
1403
+ def __init__(self):
1404
+ pass
1405
+
1406
+ def from_civitai(self, state_dict):
1407
+ state_dict_ = {}
1408
+ if "model_state" in state_dict:
1409
+ state_dict = state_dict["model_state"]
1410
+ for name in state_dict:
1411
+ state_dict_["model." + name] = state_dict[name]
1412
+ return state_dict_
1413
+
1414
+
1415
+ class VideoVAE38_(VideoVAE_):
1416
+ def __init__(
1417
+ self,
1418
+ dim=160,
1419
+ z_dim=48,
1420
+ dec_dim=256,
1421
+ dim_mult=[1, 2, 4, 4],
1422
+ num_res_blocks=2,
1423
+ attn_scales=[],
1424
+ temperal_downsample=[False, True, True],
1425
+ dropout=0.0,
1426
+ ):
1427
+ super(VideoVAE_, self).__init__()
1428
+ self.dim = dim
1429
+ self.z_dim = z_dim
1430
+ self.dim_mult = dim_mult
1431
+ self.num_res_blocks = num_res_blocks
1432
+ self.attn_scales = attn_scales
1433
+ self.temperal_downsample = temperal_downsample
1434
+ self.temperal_upsample = temperal_downsample[::-1]
1435
+
1436
+ # modules
1437
+ self.encoder = Encoder3d_38(
1438
+ dim,
1439
+ z_dim * 2,
1440
+ dim_mult,
1441
+ num_res_blocks,
1442
+ attn_scales,
1443
+ self.temperal_downsample,
1444
+ dropout,
1445
+ )
1446
+ self.conv1 = CausalConv3d(z_dim * 2, z_dim * 2, 1)
1447
+ self.conv2 = CausalConv3d(z_dim, z_dim, 1)
1448
+ self.decoder = Decoder3d_38(
1449
+ dec_dim,
1450
+ z_dim,
1451
+ dim_mult,
1452
+ num_res_blocks,
1453
+ attn_scales,
1454
+ self.temperal_upsample,
1455
+ dropout,
1456
+ )
1457
+
1458
+ def encode(self, x, scale):
1459
+ self.clear_cache()
1460
+ x = patchify(x, patch_size=2)
1461
+ t = x.shape[2]
1462
+ iter_ = 1 + (t - 1) // 4
1463
+ for i in range(iter_):
1464
+ self._enc_conv_idx = [0]
1465
+ if i == 0:
1466
+ out = self.encoder(
1467
+ x[:, :, :1, :, :],
1468
+ feat_cache=self._enc_feat_map,
1469
+ feat_idx=self._enc_conv_idx,
1470
+ )
1471
+ else:
1472
+ out_ = self.encoder(
1473
+ x[:, :, 1 + 4 * (i - 1) : 1 + 4 * i, :, :],
1474
+ feat_cache=self._enc_feat_map,
1475
+ feat_idx=self._enc_conv_idx,
1476
+ )
1477
+ out = torch.cat([out, out_], 2)
1478
+ mu, log_var = self.conv1(out).chunk(2, dim=1)
1479
+ if isinstance(scale[0], torch.Tensor):
1480
+ scale = [s.to(dtype=mu.dtype, device=mu.device) for s in scale]
1481
+ mu = (mu - scale[0].view(1, self.z_dim, 1, 1, 1)) * scale[1].view(
1482
+ 1, self.z_dim, 1, 1, 1
1483
+ )
1484
+ else:
1485
+ scale = scale.to(dtype=mu.dtype, device=mu.device)
1486
+ mu = (mu - scale[0]) * scale[1]
1487
+ self.clear_cache()
1488
+ return mu
1489
+
1490
+ def decode(self, z, scale):
1491
+ self.clear_cache()
1492
+ if isinstance(scale[0], torch.Tensor):
1493
+ scale = [s.to(dtype=z.dtype, device=z.device) for s in scale]
1494
+ z = z / scale[1].view(1, self.z_dim, 1, 1, 1) + scale[0].view(
1495
+ 1, self.z_dim, 1, 1, 1
1496
+ )
1497
+ else:
1498
+ scale = scale.to(dtype=z.dtype, device=z.device)
1499
+ z = z / scale[1] + scale[0]
1500
+ iter_ = z.shape[2]
1501
+ x = self.conv2(z)
1502
+ for i in range(iter_):
1503
+ self._conv_idx = [0]
1504
+ if i == 0:
1505
+ out = self.decoder(
1506
+ x[:, :, i : i + 1, :, :],
1507
+ feat_cache=self._feat_map,
1508
+ feat_idx=self._conv_idx,
1509
+ first_chunk=True,
1510
+ )
1511
+ else:
1512
+ out_ = self.decoder(
1513
+ x[:, :, i : i + 1, :, :],
1514
+ feat_cache=self._feat_map,
1515
+ feat_idx=self._conv_idx,
1516
+ )
1517
+ out = torch.cat([out, out_], 2)
1518
+ out = unpatchify(out, patch_size=2)
1519
+ self.clear_cache()
1520
+ return out
1521
+
1522
+
1523
+ class WanVideoVAE38(WanVideoVAE):
1524
+ def __init__(self, z_dim=48, dim=160):
1525
+ super(WanVideoVAE, self).__init__()
1526
+
1527
+ mean = [
1528
+ -0.2289,
1529
+ -0.0052,
1530
+ -0.1323,
1531
+ -0.2339,
1532
+ -0.2799,
1533
+ 0.0174,
1534
+ 0.1838,
1535
+ 0.1557,
1536
+ -0.1382,
1537
+ 0.0542,
1538
+ 0.2813,
1539
+ 0.0891,
1540
+ 0.1570,
1541
+ -0.0098,
1542
+ 0.0375,
1543
+ -0.1825,
1544
+ -0.2246,
1545
+ -0.1207,
1546
+ -0.0698,
1547
+ 0.5109,
1548
+ 0.2665,
1549
+ -0.2108,
1550
+ -0.2158,
1551
+ 0.2502,
1552
+ -0.2055,
1553
+ -0.0322,
1554
+ 0.1109,
1555
+ 0.1567,
1556
+ -0.0729,
1557
+ 0.0899,
1558
+ -0.2799,
1559
+ -0.1230,
1560
+ -0.0313,
1561
+ -0.1649,
1562
+ 0.0117,
1563
+ 0.0723,
1564
+ -0.2839,
1565
+ -0.2083,
1566
+ -0.0520,
1567
+ 0.3748,
1568
+ 0.0152,
1569
+ 0.1957,
1570
+ 0.1433,
1571
+ -0.2944,
1572
+ 0.3573,
1573
+ -0.0548,
1574
+ -0.1681,
1575
+ -0.0667,
1576
+ ]
1577
+ std = [
1578
+ 0.4765,
1579
+ 1.0364,
1580
+ 0.4514,
1581
+ 1.1677,
1582
+ 0.5313,
1583
+ 0.4990,
1584
+ 0.4818,
1585
+ 0.5013,
1586
+ 0.8158,
1587
+ 1.0344,
1588
+ 0.5894,
1589
+ 1.0901,
1590
+ 0.6885,
1591
+ 0.6165,
1592
+ 0.8454,
1593
+ 0.4978,
1594
+ 0.5759,
1595
+ 0.3523,
1596
+ 0.7135,
1597
+ 0.6804,
1598
+ 0.5833,
1599
+ 1.4146,
1600
+ 0.8986,
1601
+ 0.5659,
1602
+ 0.7069,
1603
+ 0.5338,
1604
+ 0.4889,
1605
+ 0.4917,
1606
+ 0.4069,
1607
+ 0.4999,
1608
+ 0.6866,
1609
+ 0.4093,
1610
+ 0.5709,
1611
+ 0.6065,
1612
+ 0.6415,
1613
+ 0.4944,
1614
+ 0.5726,
1615
+ 1.2042,
1616
+ 0.5458,
1617
+ 1.6887,
1618
+ 0.3971,
1619
+ 1.0600,
1620
+ 0.3943,
1621
+ 0.5537,
1622
+ 0.5444,
1623
+ 0.4089,
1624
+ 0.7468,
1625
+ 0.7744,
1626
+ ]
1627
+ self.mean = torch.tensor(mean)
1628
+ self.std = torch.tensor(std)
1629
+ self.scale = [self.mean, 1.0 / self.std]
1630
+
1631
+ # init model
1632
+ self.model = VideoVAE38_(z_dim=z_dim, dim=dim).eval().requires_grad_(False)
1633
+ self.upsampling_factor = 16
1634
+ self.z_dim = z_dim
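For orientation before the pipeline files: the two wrappers above differ mainly in latent width and spatial compression (WanVideoVAE: z_dim 16, 8x spatial factor; WanVideoVAE38: z_dim 48, 16x factor via the 2x2 patchify in VideoVAE38_). A minimal, purely illustrative sketch of the latent shapes this implies:

    # Latent shape arithmetic implied by the code above (illustrative helper, not part of the repo).
    def latent_shape(frames, height, width, z_dim, factor):
        return (1, z_dim, 1 + (frames - 1) // 4, height // factor, width // factor)

    print(latent_shape(81, 480, 832, z_dim=16, factor=8))   # (1, 16, 21, 60, 104)
    print(latent_shape(81, 480, 832, z_dim=48, factor=16))  # (1, 48, 21, 30, 52)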
pipelines/base.py ADDED
@@ -0,0 +1,173 @@
1
+ import torch
2
+ import numpy as np
3
+ from PIL import Image
4
+ from torchvision.transforms import GaussianBlur
5
+
6
+
7
+ class BasePipeline(torch.nn.Module):
8
+ def __init__(
9
+ self,
10
+ device="cuda",
11
+ torch_dtype=torch.float16,
12
+ height_division_factor=64,
13
+ width_division_factor=64,
14
+ ):
15
+ super().__init__()
16
+ self.device = device
17
+ self.torch_dtype = torch_dtype
18
+ self.height_division_factor = height_division_factor
19
+ self.width_division_factor = width_division_factor
20
+ self.cpu_offload = False
21
+ self.model_names = []
22
+
23
+ def check_resize_height_width(self, height, width):
24
+ if height % self.height_division_factor != 0:
25
+ height = (
26
+ (height + self.height_division_factor - 1)
27
+ // self.height_division_factor
28
+ * self.height_division_factor
29
+ )
30
+ print(
31
+ f"The height cannot be evenly divided by {self.height_division_factor}. We round it up to {height}."
32
+ )
33
+ if width % self.width_division_factor != 0:
34
+ width = (
35
+ (width + self.width_division_factor - 1)
36
+ // self.width_division_factor
37
+ * self.width_division_factor
38
+ )
39
+ print(
40
+ f"The width cannot be evenly divided by {self.width_division_factor}. We round it up to {width}."
41
+ )
42
+ return height, width
43
+
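+ # Converts a PIL image in [0, 255] to a [1, 3, H, W] tensor in [-1, 1].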
44
+ def preprocess_image(self, image):
45
+ image = (
46
+ torch.Tensor(np.array(image, dtype=np.float32) * (2 / 255) - 1)
47
+ .permute(2, 0, 1)
48
+ .unsqueeze(0)
49
+ )
50
+ return image
51
+
52
+ def preprocess_images(self, images):
53
+ return [self.preprocess_image(image) for image in images]
54
+
55
+ def vae_output_to_image(self, vae_output):
56
+ image = vae_output[0].cpu().float().permute(1, 2, 0).numpy()
57
+ image = Image.fromarray(((image / 2 + 0.5).clip(0, 1) * 255).astype("uint8"))
58
+ return image
59
+
60
+ def vae_output_to_video(self, vae_output):
61
+ video = vae_output.cpu().permute(1, 2, 0).numpy()
62
+ video = [
63
+ Image.fromarray(((image / 2 + 0.5).clip(0, 1) * 255).astype("uint8"))
64
+ for image in video
65
+ ]
66
+ return video
67
+
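+ # Blends region-specific latents into value using Gaussian-blurred masks: accumulate
+ # latent * mask * scale along with the weights, then normalize by the summed weight.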
68
+ def merge_latents(
69
+ self, value, latents, masks, scales, blur_kernel_size=33, blur_sigma=10.0
70
+ ):
71
+ if len(latents) > 0:
72
+ blur = GaussianBlur(kernel_size=blur_kernel_size, sigma=blur_sigma)
73
+ height, width = value.shape[-2:]
74
+ weight = torch.ones_like(value)
75
+ for latent, mask, scale in zip(latents, masks, scales):
76
+ mask = (
77
+ self.preprocess_image(mask.resize((width, height))).mean(
78
+ dim=1, keepdim=True
79
+ )
80
+ > 0
81
+ )
82
+ mask = mask.repeat(1, latent.shape[1], 1, 1).to(
83
+ dtype=latent.dtype, device=latent.device
84
+ )
85
+ mask = blur(mask)
86
+ value += latent * mask * scale
87
+ weight += mask * scale
88
+ value /= weight
89
+ return value
90
+
91
+ def control_noise_via_local_prompts(
92
+ self,
93
+ prompt_emb_global,
94
+ prompt_emb_locals,
95
+ masks,
96
+ mask_scales,
97
+ inference_callback,
98
+ special_kwargs=None,
99
+ special_local_kwargs_list=None,
100
+ ):
101
+ if special_kwargs is None:
102
+ noise_pred_global = inference_callback(prompt_emb_global)
103
+ else:
104
+ noise_pred_global = inference_callback(prompt_emb_global, special_kwargs)
105
+ if special_local_kwargs_list is None:
106
+ noise_pred_locals = [
107
+ inference_callback(prompt_emb_local)
108
+ for prompt_emb_local in prompt_emb_locals
109
+ ]
110
+ else:
111
+ noise_pred_locals = [
112
+ inference_callback(prompt_emb_local, special_kwargs)
113
+ for prompt_emb_local, special_kwargs in zip(
114
+ prompt_emb_locals, special_local_kwargs_list
115
+ )
116
+ ]
117
+ noise_pred = self.merge_latents(
118
+ noise_pred_global, noise_pred_locals, masks, mask_scales
119
+ )
120
+ return noise_pred
121
+
122
+ def extend_prompt(self, prompt, local_prompts, masks, mask_scales):
123
+ local_prompts = local_prompts or []
124
+ masks = masks or []
125
+ mask_scales = mask_scales or []
126
+ extended_prompt_dict = self.prompter.extend_prompt(prompt)
127
+ prompt = extended_prompt_dict.get("prompt", prompt)
128
+ local_prompts += extended_prompt_dict.get("prompts", [])
129
+ masks += extended_prompt_dict.get("masks", [])
130
+ mask_scales += [100.0] * len(extended_prompt_dict.get("masks", []))
131
+ return prompt, local_prompts, masks, mask_scales
132
+
133
+ def enable_cpu_offload(self):
134
+ self.cpu_offload = True
135
+
136
+ def load_models_to_device(self, loadmodel_names=[]):
137
+ # only load models to device if cpu_offload is enabled
138
+ if not self.cpu_offload:
139
+ return
140
+ # offload the unneeded models to cpu
141
+ for model_name in self.model_names:
142
+ if model_name not in loadmodel_names:
143
+ model = getattr(self, model_name)
144
+ if model is not None:
145
+ if (
146
+ hasattr(model, "vram_management_enabled")
147
+ and model.vram_management_enabled
148
+ ):
149
+ for module in model.modules():
150
+ if hasattr(module, "offload"):
151
+ module.offload()
152
+ else:
153
+ model.cpu()
154
+ # load the needed models to device
155
+ for model_name in loadmodel_names:
156
+ model = getattr(self, model_name)
157
+ if model is not None:
158
+ if (
159
+ hasattr(model, "vram_management_enabled")
160
+ and model.vram_management_enabled
161
+ ):
162
+ for module in model.modules():
163
+ if hasattr(module, "onload"):
164
+ module.onload()
165
+ else:
166
+ model.to(self.device)
167
+ # release cached GPU memory
168
+ torch.cuda.empty_cache()
169
+
170
+ def generate_noise(self, shape, seed=None, device="cpu", dtype=torch.float16):
171
+ generator = None if seed is None else torch.Generator(device).manual_seed(seed)
172
+ noise = torch.randn(shape, generator=generator, device=device, dtype=dtype)
173
+ return noise
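A minimal sketch of the offload pattern BasePipeline expects from its subclasses (ToyPipeline and its linear layer are placeholders, not part of the repository):

    import torch
    from pipelines.base import BasePipeline  # path as laid out in this repo

    class ToyPipeline(BasePipeline):
        def __init__(self):
            super().__init__(device="cuda" if torch.cuda.is_available() else "cpu")
            self.toy_model = torch.nn.Linear(4, 4)  # stand-in for a real submodel
            self.model_names = ["toy_model"]

    pipe = ToyPipeline()
    pipe.enable_cpu_offload()                  # without this, load_models_to_device is a no-op
    pipe.load_models_to_device(["toy_model"])  # onload: moves toy_model to pipe.device
    pipe.load_models_to_device([])             # offload: moves it back to CPU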
pipelines/wan_video.py ADDED
@@ -0,0 +1,1793 @@
1
+ import torch, types
2
+ import numpy as np
3
+ from PIL import Image
4
+ from einops import repeat
5
+ from typing import Optional, Union
6
+ from einops import rearrange
7
+ from tqdm import tqdm
10
+ from typing_extensions import Literal
11
+ import imageio
12
+ import os
13
+ from typing import List, Tuple
14
+ import PIL
15
+ from utils import BasePipeline, ModelConfig, PipelineUnit, PipelineUnitRunner
16
+ from models import ModelManager, load_state_dict
17
+ from models.wan_video_dit import WanModel, RMSNorm, sinusoidal_embedding_1d
18
+ from models.wan_video_text_encoder import (
19
+ WanTextEncoder,
20
+ T5RelativeEmbedding,
21
+ T5LayerNorm,
22
+ )
23
+ from models.wan_video_vae import WanVideoVAE, RMS_norm, CausalConv3d, Upsample
24
+ from models.wan_video_image_encoder import WanImageEncoder
25
+ from models.wan_video_vace import VaceWanModel
26
+ from models.wan_video_motion_controller import WanMotionControllerModel
27
+ from schedulers.flow_match import FlowMatchScheduler
28
+ from prompters import WanPrompter
29
+ from vram_management import (
30
+ enable_vram_management,
31
+ AutoWrappedModule,
32
+ AutoWrappedLinear,
33
+ WanAutoCastLayerNorm,
34
+ )
35
+ from lora import GeneralLoRALoader
36
+
37
+ def load_video_as_list(video_path: str) -> Tuple[List[Image.Image], int, int, int]:
38
+ if not os.path.isfile(video_path):
39
+ raise FileNotFoundError(f"Video file not found: {video_path}")
40
+
41
+ reader = imageio.get_reader(video_path)
42
+
43
+ meta_data = reader.get_meta_data()
44
+ original_width = meta_data['size'][0]
45
+ original_height = meta_data['size'][1]
46
+
47
+ new_width = (original_width // 16) * 16
48
+ new_height = (original_height // 16) * 16
49
+
50
+ left = (original_width - new_width) // 2
51
+ top = (original_height - new_height) // 2
52
+ right = left + new_width
53
+ bottom = top + new_height
54
+ crop_box = (left, top, right, bottom)
55
+
56
+ original_frame_count = reader.count_frames()
57
+ new_frame_count = original_frame_count - ((original_frame_count - 1) % 4)
58
+
59
+ frames = []
60
+ for i in range(new_frame_count):
61
+ try:
62
+ frame_data = reader.get_data(i)
63
+ pil_image = Image.fromarray(frame_data)
64
+ cropped_image = pil_image.crop(crop_box)
65
+ frames.append(cropped_image)
66
+ except IndexError:
67
+ print(f"Warning: Actual number of frames is less than expected. Stopping at frame {i}.")
68
+ new_frame_count = len(frames)
69
+ break
70
+
71
+ reader.close()
72
+
73
+ return frames, new_width, new_height, new_frame_count
74
+
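+ # Center-crops each frame to multiples of 16 (the pipeline's spatial division factor) and
+ # trims the clip to 4k + 1 frames, matching the VAE's temporal compression.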
75
+ class WanVideoPipeline(BasePipeline):
76
+ def __init__(self, device="cuda", torch_dtype=torch.bfloat16, tokenizer_path=None):
77
+ super().__init__(
78
+ device=device,
79
+ torch_dtype=torch_dtype,
80
+ height_division_factor=16,
81
+ width_division_factor=16,
82
+ time_division_factor=4,
83
+ time_division_remainder=1,
84
+ )
85
+ self.scheduler = FlowMatchScheduler(shift=5, sigma_min=0.0, extra_one_step=True)
86
+ self.prompter = WanPrompter(tokenizer_path=tokenizer_path)
87
+ self.text_encoder: WanTextEncoder = None
88
+ self.image_encoder: WanImageEncoder = None
89
+ self.dit: WanModel = None
90
+ self.dit2: WanModel = None
91
+ self.vae: WanVideoVAE = None
92
+ self.motion_controller: WanMotionControllerModel = None
93
+ self.vace: VaceWanModel = None
94
+ self.in_iteration_models = ("dit", "motion_controller", "vace")
95
+ self.in_iteration_models_2 = ("dit2", "motion_controller", "vace")
96
+ self.unit_runner = PipelineUnitRunner()
97
+ self.units = [
98
+ WanVideoUnit_ShapeChecker(),
99
+ WanVideoUnit_NoiseInitializer(),
100
+ WanVideoUnit_InputVideoEmbedder(),
101
+ WanVideoUnit_PromptEmbedder(),
102
+ WanVideoUnit_ImageEmbedderVAE(),
103
+ WanVideoUnit_ImageEmbedderCLIP(),
104
+ WanVideoUnit_ImageEmbedderFused(),
105
+ WanVideoUnit_FunControl(),
106
+ WanVideoUnit_FunReference(),
107
+ WanVideoUnit_FunCameraControl(),
108
+ WanVideoUnit_SpeedControl(),
109
+ WanVideoUnit_VACE(),
110
+ WanVideoUnit_UnifiedSequenceParallel(),
111
+ WanVideoUnit_TeaCache(),
112
+ WanVideoUnit_CfgMerger(),
113
+ ]
114
+ self.model_fn = model_fn_wan_video
115
+
116
+ def encode_ip_image(self, ip_image):
117
+ self.load_models_to_device(["vae"])
118
+ ip_image = (
119
+ torch.tensor(np.array(ip_image)).permute(2, 0, 1).float() / 255.0
120
+ ) # [3, H, W]
121
+ ip_image = (
122
+ ip_image.unsqueeze(1).unsqueeze(0).to(dtype=self.torch_dtype)
123
+ ) # [B, 3, 1, H, W]
124
+ ip_image = ip_image * 2 - 1
125
+ ip_image_latent = self.vae.encode(ip_image, device=self.device, tiled=False)
126
+ return ip_image_latent
127
+
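+ # VAE-encodes the Stand-In reference image as a single-frame clip ([B, 3, 1, H, W] in [-1, 1]).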
128
+ def load_lora(self, module, path, alpha=1):
129
+ loader = GeneralLoRALoader(torch_dtype=self.torch_dtype, device=self.device)
130
+ lora = load_state_dict(path, torch_dtype=self.torch_dtype, device=self.device)
131
+ loader.load(module, lora, alpha=alpha)
132
+
133
+ def training_loss(self, **inputs):
134
+ max_timestep_boundary = int(
135
+ inputs.get("max_timestep_boundary", 1) * self.scheduler.num_train_timesteps
136
+ )
137
+ min_timestep_boundary = int(
138
+ inputs.get("min_timestep_boundary", 0) * self.scheduler.num_train_timesteps
139
+ )
140
+ timestep_id = torch.randint(min_timestep_boundary, max_timestep_boundary, (1,))
141
+ timestep = self.scheduler.timesteps[timestep_id].to(
142
+ dtype=self.torch_dtype, device=self.device
143
+ )
144
+
145
+ inputs["latents"] = self.scheduler.add_noise(
146
+ inputs["input_latents"], inputs["noise"], timestep
147
+ )
148
+ training_target = self.scheduler.training_target(
149
+ inputs["input_latents"], inputs["noise"], timestep
150
+ )
151
+
152
+ noise_pred = self.model_fn(**inputs, timestep=timestep)
153
+
154
+ loss = torch.nn.functional.mse_loss(noise_pred.float(), training_target.float())
155
+ loss = loss * self.scheduler.training_weight(timestep)
156
+ return loss
157
+
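+ # Flow-matching training step: sample a timestep within the given boundary fraction of the
+ # schedule, noise the clean latents, and regress the model output onto the scheduler's
+ # training target with a per-timestep weighted MSE loss.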
158
+ def enable_vram_management(
159
+ self, num_persistent_param_in_dit=None, vram_limit=None, vram_buffer=0.5
160
+ ):
161
+ self.vram_management_enabled = True
162
+ if num_persistent_param_in_dit is not None:
163
+ vram_limit = None
164
+ else:
165
+ if vram_limit is None:
166
+ vram_limit = self.get_vram()
167
+ vram_limit = vram_limit - vram_buffer
168
+ if self.text_encoder is not None:
169
+ dtype = next(iter(self.text_encoder.parameters())).dtype
170
+ enable_vram_management(
171
+ self.text_encoder,
172
+ module_map={
173
+ torch.nn.Linear: AutoWrappedLinear,
174
+ torch.nn.Embedding: AutoWrappedModule,
175
+ T5RelativeEmbedding: AutoWrappedModule,
176
+ T5LayerNorm: AutoWrappedModule,
177
+ },
178
+ module_config=dict(
179
+ offload_dtype=dtype,
180
+ offload_device="cpu",
181
+ onload_dtype=dtype,
182
+ onload_device="cpu",
183
+ computation_dtype=self.torch_dtype,
184
+ computation_device=self.device,
185
+ ),
186
+ vram_limit=vram_limit,
187
+ )
188
+ if self.dit is not None:
189
+ dtype = next(iter(self.dit.parameters())).dtype
190
+ device = "cpu" if vram_limit is not None else self.device
191
+ enable_vram_management(
192
+ self.dit,
193
+ module_map={
194
+ torch.nn.Linear: AutoWrappedLinear,
195
+ torch.nn.Conv3d: AutoWrappedModule,
196
+ torch.nn.LayerNorm: WanAutoCastLayerNorm,
197
+ RMSNorm: AutoWrappedModule,
198
+ torch.nn.Conv2d: AutoWrappedModule,
199
+ },
200
+ module_config=dict(
201
+ offload_dtype=dtype,
202
+ offload_device="cpu",
203
+ onload_dtype=dtype,
204
+ onload_device=device,
205
+ computation_dtype=self.torch_dtype,
206
+ computation_device=self.device,
207
+ ),
208
+ max_num_param=num_persistent_param_in_dit,
209
+ overflow_module_config=dict(
210
+ offload_dtype=dtype,
211
+ offload_device="cpu",
212
+ onload_dtype=dtype,
213
+ onload_device="cpu",
214
+ computation_dtype=self.torch_dtype,
215
+ computation_device=self.device,
216
+ ),
217
+ vram_limit=vram_limit,
218
+ )
219
+ if self.dit2 is not None:
220
+ dtype = next(iter(self.dit2.parameters())).dtype
221
+ device = "cpu" if vram_limit is not None else self.device
222
+ enable_vram_management(
223
+ self.dit2,
224
+ module_map={
225
+ torch.nn.Linear: AutoWrappedLinear,
226
+ torch.nn.Conv3d: AutoWrappedModule,
227
+ torch.nn.LayerNorm: WanAutoCastLayerNorm,
228
+ RMSNorm: AutoWrappedModule,
229
+ torch.nn.Conv2d: AutoWrappedModule,
230
+ },
231
+ module_config=dict(
232
+ offload_dtype=dtype,
233
+ offload_device="cpu",
234
+ onload_dtype=dtype,
235
+ onload_device=device,
236
+ computation_dtype=self.torch_dtype,
237
+ computation_device=self.device,
238
+ ),
239
+ max_num_param=num_persistent_param_in_dit,
240
+ overflow_module_config=dict(
241
+ offload_dtype=dtype,
242
+ offload_device="cpu",
243
+ onload_dtype=dtype,
244
+ onload_device="cpu",
245
+ computation_dtype=self.torch_dtype,
246
+ computation_device=self.device,
247
+ ),
248
+ vram_limit=vram_limit,
249
+ )
250
+ if self.vae is not None:
251
+ dtype = next(iter(self.vae.parameters())).dtype
252
+ enable_vram_management(
253
+ self.vae,
254
+ module_map={
255
+ torch.nn.Linear: AutoWrappedLinear,
256
+ torch.nn.Conv2d: AutoWrappedModule,
257
+ RMS_norm: AutoWrappedModule,
258
+ CausalConv3d: AutoWrappedModule,
259
+ Upsample: AutoWrappedModule,
260
+ torch.nn.SiLU: AutoWrappedModule,
261
+ torch.nn.Dropout: AutoWrappedModule,
262
+ },
263
+ module_config=dict(
264
+ offload_dtype=dtype,
265
+ offload_device="cpu",
266
+ onload_dtype=dtype,
267
+ onload_device=self.device,
268
+ computation_dtype=self.torch_dtype,
269
+ computation_device=self.device,
270
+ ),
271
+ )
272
+ if self.image_encoder is not None:
273
+ dtype = next(iter(self.image_encoder.parameters())).dtype
274
+ enable_vram_management(
275
+ self.image_encoder,
276
+ module_map={
277
+ torch.nn.Linear: AutoWrappedLinear,
278
+ torch.nn.Conv2d: AutoWrappedModule,
279
+ torch.nn.LayerNorm: AutoWrappedModule,
280
+ },
281
+ module_config=dict(
282
+ offload_dtype=dtype,
283
+ offload_device="cpu",
284
+ onload_dtype=dtype,
285
+ onload_device="cpu",
286
+ computation_dtype=dtype,
287
+ computation_device=self.device,
288
+ ),
289
+ )
290
+ if self.motion_controller is not None:
291
+ dtype = next(iter(self.motion_controller.parameters())).dtype
292
+ enable_vram_management(
293
+ self.motion_controller,
294
+ module_map={
295
+ torch.nn.Linear: AutoWrappedLinear,
296
+ },
297
+ module_config=dict(
298
+ offload_dtype=dtype,
299
+ offload_device="cpu",
300
+ onload_dtype=dtype,
301
+ onload_device="cpu",
302
+ computation_dtype=dtype,
303
+ computation_device=self.device,
304
+ ),
305
+ )
306
+ if self.vace is not None:
307
+ device = "cpu" if vram_limit is not None else self.device
308
+ enable_vram_management(
309
+ self.vace,
310
+ module_map={
311
+ torch.nn.Linear: AutoWrappedLinear,
312
+ torch.nn.Conv3d: AutoWrappedModule,
313
+ torch.nn.LayerNorm: AutoWrappedModule,
314
+ RMSNorm: AutoWrappedModule,
315
+ },
316
+ module_config=dict(
317
+ offload_dtype=dtype,
318
+ offload_device="cpu",
319
+ onload_dtype=dtype,
320
+ onload_device=device,
321
+ computation_dtype=self.torch_dtype,
322
+ computation_device=self.device,
323
+ ),
324
+ vram_limit=vram_limit,
325
+ )
326
+
327
+ def initialize_usp(self):
328
+ import torch.distributed as dist
329
+ from xfuser.core.distributed import (
330
+ initialize_model_parallel,
331
+ init_distributed_environment,
332
+ )
333
+
334
+ dist.init_process_group(backend="nccl", init_method="env://")
335
+ init_distributed_environment(
336
+ rank=dist.get_rank(), world_size=dist.get_world_size()
337
+ )
338
+ initialize_model_parallel(
339
+ sequence_parallel_degree=dist.get_world_size(),
340
+ ring_degree=1,
341
+ ulysses_degree=dist.get_world_size(),
342
+ )
343
+ torch.cuda.set_device(dist.get_rank())
344
+
345
+ def enable_usp(self):
346
+ from xfuser.core.distributed import get_sequence_parallel_world_size
347
+ from distributed.xdit_context_parallel import (
348
+ usp_attn_forward,
349
+ usp_dit_forward,
350
+ )
351
+
352
+ for block in self.dit.blocks:
353
+ block.self_attn.forward = types.MethodType(
354
+ usp_attn_forward, block.self_attn
355
+ )
356
+ self.dit.forward = types.MethodType(usp_dit_forward, self.dit)
357
+ if self.dit2 is not None:
358
+ for block in self.dit2.blocks:
359
+ block.self_attn.forward = types.MethodType(
360
+ usp_attn_forward, block.self_attn
361
+ )
362
+ self.dit2.forward = types.MethodType(usp_dit_forward, self.dit2)
363
+ self.sp_size = get_sequence_parallel_world_size()
364
+ self.use_unified_sequence_parallel = True
365
+
366
+ @staticmethod
367
+ def from_pretrained(
368
+ torch_dtype: torch.dtype = torch.bfloat16,
369
+ device: Union[str, torch.device] = "cuda",
370
+ model_configs: list[ModelConfig] = [],
371
+ tokenizer_config: ModelConfig = ModelConfig(
372
+ model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="google/*"
373
+ ),
374
+ redirect_common_files: bool = True,
375
+ use_usp=False,
376
+ ):
377
+ # Redirect model path
378
+ if redirect_common_files:
379
+ redirect_dict = {
380
+ "models_t5_umt5-xxl-enc-bf16.pth": "Wan-AI/Wan2.1-T2V-1.3B",
381
+ "Wan2.1_VAE.pth": "Wan-AI/Wan2.1-T2V-1.3B",
382
+ "models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth": "Wan-AI/Wan2.1-I2V-14B-480P",
383
+ }
384
+ for model_config in model_configs:
385
+ if (
386
+ model_config.origin_file_pattern is None
387
+ or model_config.model_id is None
388
+ ):
389
+ continue
390
+ if (
391
+ model_config.origin_file_pattern in redirect_dict
392
+ and model_config.model_id
393
+ != redirect_dict[model_config.origin_file_pattern]
394
+ ):
395
+ print(
396
+ f"To avoid repeatedly downloading model files, ({model_config.model_id}, {model_config.origin_file_pattern}) is redirected to ({redirect_dict[model_config.origin_file_pattern]}, {model_config.origin_file_pattern}). You can use `redirect_common_files=False` to disable file redirection."
397
+ )
398
+ model_config.model_id = redirect_dict[
399
+ model_config.origin_file_pattern
400
+ ]
401
+
402
+ # Initialize pipeline
403
+ pipe = WanVideoPipeline(device=device, torch_dtype=torch_dtype)
404
+ if use_usp:
405
+ pipe.initialize_usp()
406
+
407
+ # Download and load models
408
+ model_manager = ModelManager()
409
+ for model_config in model_configs:
410
+ model_config.download_if_necessary(use_usp=use_usp)
411
+ model_manager.load_model(
412
+ model_config.path,
413
+ device=model_config.offload_device or device,
414
+ torch_dtype=model_config.offload_dtype or torch_dtype,
415
+ )
416
+
417
+ # Load models
418
+ pipe.text_encoder = model_manager.fetch_model("wan_video_text_encoder")
419
+ dit = model_manager.fetch_model("wan_video_dit", index=2)
420
+ if isinstance(dit, list):
421
+ pipe.dit, pipe.dit2 = dit
422
+ else:
423
+ pipe.dit = dit
424
+ pipe.vae = model_manager.fetch_model("wan_video_vae")
425
+ pipe.image_encoder = model_manager.fetch_model("wan_video_image_encoder")
426
+ pipe.motion_controller = model_manager.fetch_model(
427
+ "wan_video_motion_controller"
428
+ )
429
+ pipe.vace = model_manager.fetch_model("wan_video_vace")
430
+
431
+ # Size division factor
432
+ if pipe.vae is not None:
433
+ pipe.height_division_factor = pipe.vae.upsampling_factor * 2
434
+ pipe.width_division_factor = pipe.vae.upsampling_factor * 2
435
+
436
+ # Initialize tokenizer
437
+ tokenizer_config.download_if_necessary(use_usp=use_usp)
438
+ pipe.prompter.fetch_models(pipe.text_encoder)
439
+ pipe.prompter.fetch_tokenizer(tokenizer_config.path)
440
+
441
+ # Unified Sequence Parallel
442
+ if use_usp:
443
+ pipe.enable_usp()
444
+ return pipe
445
+
446
+ @torch.no_grad()
447
+ def __call__(
448
+ self,
449
+ # Prompt
450
+ prompt: str,
451
+ negative_prompt: Optional[str] = "",
452
+ # Image-to-video
453
+ input_image: Optional[Image.Image] = None,
454
+ # First-last-frame-to-video
455
+ end_image: Optional[Image.Image] = None,
456
+ # Video-to-video
457
+ input_video: Optional[list[Image.Image]] = None,
458
+ denoising_strength: Optional[float] = 1.0,
459
+ # ControlNet
460
+ control_video: Optional[list[Image.Image]] = None,
461
+ reference_image: Optional[Image.Image] = None,
462
+ # Camera control
463
+ camera_control_direction: Optional[
464
+ Literal[
465
+ "Left",
466
+ "Right",
467
+ "Up",
468
+ "Down",
469
+ "LeftUp",
470
+ "LeftDown",
471
+ "RightUp",
472
+ "RightDown",
473
+ ]
474
+ ] = None,
475
+ camera_control_speed: Optional[float] = 1 / 54,
476
+ camera_control_origin: Optional[tuple] = (
477
+ 0,
478
+ 0.532139961,
479
+ 0.946026558,
480
+ 0.5,
481
+ 0.5,
482
+ 0,
483
+ 0,
484
+ 1,
485
+ 0,
486
+ 0,
487
+ 0,
488
+ 0,
489
+ 1,
490
+ 0,
491
+ 0,
492
+ 0,
493
+ 0,
494
+ 1,
495
+ 0,
496
+ ),
497
+ # VACE
498
+ vace_video: Optional[str] = None,  # path to a video file; loaded via load_video_as_list
499
+ vace_video_mask: Optional[Image.Image] = None,
500
+ vace_reference_image: Optional[str] = None,  # path to a reference image; opened with PIL
501
+ vace_scale: Optional[float] = 1.0,
502
+ # Randomness
503
+ seed: Optional[int] = None,
504
+ rand_device: Optional[str] = "cpu",
505
+ # Shape
506
+ height: Optional[int] = 480,
507
+ width: Optional[int] = 832,
508
+ num_frames=81,
509
+ # Classifier-free guidance
510
+ cfg_scale: Optional[float] = 5.0,
511
+ cfg_merge: Optional[bool] = False,
512
+ # Boundary
513
+ switch_DiT_boundary: Optional[float] = 0.875,
514
+ # Scheduler
515
+ num_inference_steps: Optional[int] = 50,
516
+ sigma_shift: Optional[float] = 5.0,
517
+ # Speed control
518
+ motion_bucket_id: Optional[int] = None,
519
+ # VAE tiling
520
+ tiled: Optional[bool] = True,
521
+ tile_size: Optional[tuple[int, int]] = (30, 52),
522
+ tile_stride: Optional[tuple[int, int]] = (15, 26),
523
+ # Sliding window
524
+ sliding_window_size: Optional[int] = None,
525
+ sliding_window_stride: Optional[int] = None,
526
+ # Teacache
527
+ tea_cache_l1_thresh: Optional[float] = None,
528
+ tea_cache_model_id: Optional[str] = "",
529
+ # progress_bar
530
+ progress_bar_cmd=tqdm,
531
+ # Stand-In
532
+ ip_image=None,
533
+ ):
534
+ if ip_image is not None:
535
+ ip_image = self.encode_ip_image(ip_image)
536
+ if vace_video is not None:
537
+ vace_video, width, height, num_frames = load_video_as_list(vace_video)
538
+ if vace_reference_image is not None:
539
+ vace_reference_image = Image.open(vace_reference_image).convert('RGB')
540
+ ref_width, ref_height = vace_reference_image.size
541
+ if ref_width != width or ref_height != height:
542
+ scale_ratio = min(width / ref_width, height / ref_height)
543
+
544
+ new_ref_width = int(ref_width * scale_ratio)
545
+ new_ref_height = int(ref_height * scale_ratio)
546
+
547
+ resized_image = vace_reference_image.resize((new_ref_width, new_ref_height), Image.LANCZOS)
548
+
549
+ background = Image.new('RGB', (width, height), (255, 255, 255))
550
+
551
+ paste_x = (width - new_ref_width) // 2
552
+ paste_y = (height - new_ref_height) // 2
553
+ background.paste(resized_image, (paste_x, paste_y))
554
+
555
+ vace_reference_image = background
556
+ # Scheduler
557
+ self.scheduler.set_timesteps(
558
+ num_inference_steps,
559
+ denoising_strength=denoising_strength,
560
+ shift=sigma_shift,
561
+ )
562
+
563
+ # Inputs
564
+ inputs_posi = {
565
+ "prompt": prompt,
566
+ "tea_cache_l1_thresh": tea_cache_l1_thresh,
567
+ "tea_cache_model_id": tea_cache_model_id,
568
+ "num_inference_steps": num_inference_steps,
569
+ }
570
+ inputs_nega = {
571
+ "negative_prompt": negative_prompt,
572
+ "tea_cache_l1_thresh": tea_cache_l1_thresh,
573
+ "tea_cache_model_id": tea_cache_model_id,
574
+ "num_inference_steps": num_inference_steps,
575
+ }
576
+ inputs_shared = {
577
+ "input_image": input_image,
578
+ "end_image": end_image,
579
+ "input_video": input_video,
580
+ "denoising_strength": denoising_strength,
581
+ "control_video": control_video,
582
+ "reference_image": reference_image,
583
+ "camera_control_direction": camera_control_direction,
584
+ "camera_control_speed": camera_control_speed,
585
+ "camera_control_origin": camera_control_origin,
586
+ "vace_video": vace_video,
587
+ "vace_video_mask": vace_video_mask,
588
+ "vace_reference_image": vace_reference_image,
589
+ "vace_scale": vace_scale,
590
+ "seed": seed,
591
+ "rand_device": rand_device,
592
+ "height": height,
593
+ "width": width,
594
+ "num_frames": num_frames,
595
+ "cfg_scale": cfg_scale,
596
+ "cfg_merge": cfg_merge,
597
+ "sigma_shift": sigma_shift,
598
+ "motion_bucket_id": motion_bucket_id,
599
+ "tiled": tiled,
600
+ "tile_size": tile_size,
601
+ "tile_stride": tile_stride,
602
+ "sliding_window_size": sliding_window_size,
603
+ "sliding_window_stride": sliding_window_stride,
604
+ "ip_image": ip_image,
605
+ }
606
+ for unit in self.units:
607
+ inputs_shared, inputs_posi, inputs_nega = self.unit_runner(
608
+ unit, self, inputs_shared, inputs_posi, inputs_nega
609
+ )
610
+ # Denoise
611
+ self.load_models_to_device(self.in_iteration_models)
612
+ models = {name: getattr(self, name) for name in self.in_iteration_models}
613
+ for progress_id, timestep in enumerate(
614
+ progress_bar_cmd(self.scheduler.timesteps)
615
+ ):
616
+ # Switch DiT if necessary
617
+ if (
618
+ timestep.item()
619
+ < switch_DiT_boundary * self.scheduler.num_train_timesteps
620
+ and self.dit2 is not None
621
+ and not models["dit"] is self.dit2
622
+ ):
623
+ self.load_models_to_device(self.in_iteration_models_2)
624
+ models["dit"] = self.dit2
625
+
626
+ # Timestep
627
+ timestep = timestep.unsqueeze(0).to(
628
+ dtype=self.torch_dtype, device=self.device
629
+ )
630
+
631
+ # Inference
632
+ noise_pred_posi = self.model_fn(
633
+ **models, **inputs_shared, **inputs_posi, timestep=timestep
634
+ )
635
+ inputs_shared["ip_image"] = None
636
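+ # Classifier-free guidance: noise_pred = nega + cfg_scale * (posi - nega).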
+ if cfg_scale != 1.0:
637
+ if cfg_merge:
638
+ noise_pred_posi, noise_pred_nega = noise_pred_posi.chunk(2, dim=0)
639
+ else:
640
+ noise_pred_nega = self.model_fn(
641
+ **models, **inputs_shared, **inputs_nega, timestep=timestep
642
+ )
643
+ noise_pred = noise_pred_nega + cfg_scale * (
644
+ noise_pred_posi - noise_pred_nega
645
+ )
646
+ else:
647
+ noise_pred = noise_pred_posi
648
+
649
+ # Scheduler
650
+ inputs_shared["latents"] = self.scheduler.step(
651
+ noise_pred,
652
+ self.scheduler.timesteps[progress_id],
653
+ inputs_shared["latents"],
654
+ )
655
+ if "first_frame_latents" in inputs_shared:
656
+ inputs_shared["latents"][:, :, 0:1] = inputs_shared[
657
+ "first_frame_latents"
658
+ ]
659
+
660
+ if vace_reference_image is not None:
661
+ inputs_shared["latents"] = inputs_shared["latents"][:, :, 1:]
662
+
663
+ # Decode
664
+ self.load_models_to_device(["vae"])
665
+ video = self.vae.decode(
666
+ inputs_shared["latents"],
667
+ device=self.device,
668
+ tiled=tiled,
669
+ tile_size=tile_size,
670
+ tile_stride=tile_stride,
671
+ )
672
+ video = self.vae_output_to_video(video)
673
+ self.load_models_to_device([])
674
+
675
+ return video
676
+
677
+
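+ # Example usage sketch (illustrative; the exact ModelConfig entries depend on the checkpoints in use):
+ #   pipe = WanVideoPipeline.from_pretrained(
+ #       torch_dtype=torch.bfloat16, device="cuda",
+ #       model_configs=[ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="*.safetensors")],
+ #   )
+ #   video = pipe(prompt="a corgi running on the beach", seed=0)   # returns a list of PIL.Image frames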
678
+ class WanVideoUnit_ShapeChecker(PipelineUnit):
679
+ def __init__(self):
680
+ super().__init__(input_params=("height", "width", "num_frames"))
681
+
682
+ def process(self, pipe: WanVideoPipeline, height, width, num_frames):
683
+ height, width, num_frames = pipe.check_resize_height_width(
684
+ height, width, num_frames
685
+ )
686
+ return {"height": height, "width": width, "num_frames": num_frames}
687
+
688
+
689
+ class WanVideoUnit_NoiseInitializer(PipelineUnit):
690
+ def __init__(self):
691
+ super().__init__(
692
+ input_params=(
693
+ "height",
694
+ "width",
695
+ "num_frames",
696
+ "seed",
697
+ "rand_device",
698
+ "vace_reference_image",
699
+ )
700
+ )
701
+
702
+ def process(
703
+ self,
704
+ pipe: WanVideoPipeline,
705
+ height,
706
+ width,
707
+ num_frames,
708
+ seed,
709
+ rand_device,
710
+ vace_reference_image,
711
+ ):
712
+ length = (num_frames - 1) // 4 + 1
713
+ if vace_reference_image is not None:
714
+ length += 1
715
+ shape = (
716
+ 1,
717
+ pipe.vae.model.z_dim,
718
+ length,
719
+ height // pipe.vae.upsampling_factor,
720
+ width // pipe.vae.upsampling_factor,
721
+ )
722
+ noise = pipe.generate_noise(shape, seed=seed, rand_device=rand_device)
723
+ if vace_reference_image is not None:
724
+ noise = torch.concat((noise[:, :, -1:], noise[:, :, :-1]), dim=2)
725
+ return {"noise": noise}
726
+
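+ # With a VACE reference image, one extra latent frame is allocated and the noise is rolled
+ # so that the reference slot sits at the front of the temporal axis.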
727
+
728
+ class WanVideoUnit_InputVideoEmbedder(PipelineUnit):
729
+ def __init__(self):
730
+ super().__init__(
731
+ input_params=(
732
+ "input_video",
733
+ "noise",
734
+ "tiled",
735
+ "tile_size",
736
+ "tile_stride",
737
+ "vace_reference_image",
738
+ ),
739
+ onload_model_names=("vae",),
740
+ )
741
+
742
+ def process(
743
+ self,
744
+ pipe: WanVideoPipeline,
745
+ input_video,
746
+ noise,
747
+ tiled,
748
+ tile_size,
749
+ tile_stride,
750
+ vace_reference_image,
751
+ ):
752
+ if input_video is None:
753
+ return {"latents": noise}
754
+ pipe.load_models_to_device(["vae"])
755
+ input_video = pipe.preprocess_video(input_video)
756
+ input_latents = pipe.vae.encode(
757
+ input_video,
758
+ device=pipe.device,
759
+ tiled=tiled,
760
+ tile_size=tile_size,
761
+ tile_stride=tile_stride,
762
+ ).to(dtype=pipe.torch_dtype, device=pipe.device)
763
+ if vace_reference_image is not None:
764
+ vace_reference_image = pipe.preprocess_video([vace_reference_image])
765
+ vace_reference_latents = pipe.vae.encode(
766
+ vace_reference_image, device=pipe.device
767
+ ).to(dtype=pipe.torch_dtype, device=pipe.device)
768
+ input_latents = torch.concat([vace_reference_latents, input_latents], dim=2)
769
+ if pipe.scheduler.training:
770
+ return {"latents": noise, "input_latents": input_latents}
771
+ else:
772
+ latents = pipe.scheduler.add_noise(
773
+ input_latents, noise, timestep=pipe.scheduler.timesteps[0]
774
+ )
775
+ return {"latents": latents}
776
+
777
+
778
+ class WanVideoUnit_PromptEmbedder(PipelineUnit):
779
+ def __init__(self):
780
+ super().__init__(
781
+ seperate_cfg=True,
782
+ input_params_posi={"prompt": "prompt", "positive": "positive"},
783
+ input_params_nega={"prompt": "negative_prompt", "positive": "positive"},
784
+ onload_model_names=("text_encoder",),
785
+ )
786
+
787
+ def process(self, pipe: WanVideoPipeline, prompt, positive) -> dict:
788
+ pipe.load_models_to_device(self.onload_model_names)
789
+ prompt_emb = pipe.prompter.encode_prompt(
790
+ prompt, positive=positive, device=pipe.device
791
+ )
792
+ return {"context": prompt_emb}
793
+
794
+
795
+ class WanVideoUnit_ImageEmbedder(PipelineUnit):
796
+ """
797
+ Deprecated
798
+ """
799
+
800
+ def __init__(self):
801
+ super().__init__(
802
+ input_params=(
803
+ "input_image",
804
+ "end_image",
805
+ "num_frames",
806
+ "height",
807
+ "width",
808
+ "tiled",
809
+ "tile_size",
810
+ "tile_stride",
811
+ ),
812
+ onload_model_names=("image_encoder", "vae"),
813
+ )
814
+
815
+ def process(
816
+ self,
817
+ pipe: WanVideoPipeline,
818
+ input_image,
819
+ end_image,
820
+ num_frames,
821
+ height,
822
+ width,
823
+ tiled,
824
+ tile_size,
825
+ tile_stride,
826
+ ):
827
+ if input_image is None or pipe.image_encoder is None:
828
+ return {}
829
+ pipe.load_models_to_device(self.onload_model_names)
830
+ image = pipe.preprocess_image(input_image.resize((width, height))).to(
831
+ pipe.device
832
+ )
833
+ clip_context = pipe.image_encoder.encode_image([image])
834
+ msk = torch.ones(1, num_frames, height // 8, width // 8, device=pipe.device)
835
+ msk[:, 1:] = 0
836
+ if end_image is not None:
837
+ end_image = pipe.preprocess_image(end_image.resize((width, height))).to(
838
+ pipe.device
839
+ )
840
+ vae_input = torch.concat(
841
+ [
842
+ image.transpose(0, 1),
843
+ torch.zeros(3, num_frames - 2, height, width).to(image.device),
844
+ end_image.transpose(0, 1),
845
+ ],
846
+ dim=1,
847
+ )
848
+ if pipe.dit.has_image_pos_emb:
849
+ clip_context = torch.concat(
850
+ [clip_context, pipe.image_encoder.encode_image([end_image])], dim=1
851
+ )
852
+ msk[:, -1:] = 1
853
+ else:
854
+ vae_input = torch.concat(
855
+ [
856
+ image.transpose(0, 1),
857
+ torch.zeros(3, num_frames - 1, height, width).to(image.device),
858
+ ],
859
+ dim=1,
860
+ )
861
+
862
+ msk = torch.concat(
863
+ [torch.repeat_interleave(msk[:, 0:1], repeats=4, dim=1), msk[:, 1:]], dim=1
864
+ )
865
+ msk = msk.view(1, msk.shape[1] // 4, 4, height // 8, width // 8)
866
+ msk = msk.transpose(1, 2)[0]
867
+
868
+ y = pipe.vae.encode(
869
+ [vae_input.to(dtype=pipe.torch_dtype, device=pipe.device)],
870
+ device=pipe.device,
871
+ tiled=tiled,
872
+ tile_size=tile_size,
873
+ tile_stride=tile_stride,
874
+ )[0]
875
+ y = y.to(dtype=pipe.torch_dtype, device=pipe.device)
876
+ y = torch.concat([msk, y])
877
+ y = y.unsqueeze(0)
878
+ clip_context = clip_context.to(dtype=pipe.torch_dtype, device=pipe.device)
879
+ y = y.to(dtype=pipe.torch_dtype, device=pipe.device)
880
+ return {"clip_feature": clip_context, "y": y}
881
+
882
+
883
+ class WanVideoUnit_ImageEmbedderCLIP(PipelineUnit):
884
+ def __init__(self):
885
+ super().__init__(
886
+ input_params=("input_image", "end_image", "height", "width"),
887
+ onload_model_names=("image_encoder",),
888
+ )
889
+
890
+ def process(self, pipe: WanVideoPipeline, input_image, end_image, height, width):
891
+ if (
892
+ input_image is None
893
+ or pipe.image_encoder is None
894
+ or not pipe.dit.require_clip_embedding
895
+ ):
896
+ return {}
897
+ pipe.load_models_to_device(self.onload_model_names)
898
+ image = pipe.preprocess_image(input_image.resize((width, height))).to(
899
+ pipe.device
900
+ )
901
+ clip_context = pipe.image_encoder.encode_image([image])
902
+ if end_image is not None:
903
+ end_image = pipe.preprocess_image(end_image.resize((width, height))).to(
904
+ pipe.device
905
+ )
906
+ if pipe.dit.has_image_pos_emb:
907
+ clip_context = torch.concat(
908
+ [clip_context, pipe.image_encoder.encode_image([end_image])], dim=1
909
+ )
910
+ clip_context = clip_context.to(dtype=pipe.torch_dtype, device=pipe.device)
911
+ return {"clip_feature": clip_context}
912
+
913
+
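+ # Builds the image-to-video conditioning y: a frame mask marking which frames are given
+ # (first, and optionally last), expanded 4x on the first frame to match the VAE's temporal
+ # compression, concatenated with the VAE latents of the zero-padded conditioning clip.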
914
+ class WanVideoUnit_ImageEmbedderVAE(PipelineUnit):
915
+ def __init__(self):
916
+ super().__init__(
917
+ input_params=(
918
+ "input_image",
919
+ "end_image",
920
+ "num_frames",
921
+ "height",
922
+ "width",
923
+ "tiled",
924
+ "tile_size",
925
+ "tile_stride",
926
+ ),
927
+ onload_model_names=("vae",),
928
+ )
929
+
930
+ def process(
931
+ self,
932
+ pipe: WanVideoPipeline,
933
+ input_image,
934
+ end_image,
935
+ num_frames,
936
+ height,
937
+ width,
938
+ tiled,
939
+ tile_size,
940
+ tile_stride,
941
+ ):
942
+ if input_image is None or not pipe.dit.require_vae_embedding:
943
+ return {}
944
+ pipe.load_models_to_device(self.onload_model_names)
945
+ image = pipe.preprocess_image(input_image.resize((width, height))).to(
946
+ pipe.device
947
+ )
948
+ msk = torch.ones(1, num_frames, height // 8, width // 8, device=pipe.device)
949
+ msk[:, 1:] = 0
950
+ if end_image is not None:
951
+ end_image = pipe.preprocess_image(end_image.resize((width, height))).to(
952
+ pipe.device
953
+ )
954
+ vae_input = torch.concat(
955
+ [
956
+ image.transpose(0, 1),
957
+ torch.zeros(3, num_frames - 2, height, width).to(image.device),
958
+ end_image.transpose(0, 1),
959
+ ],
960
+ dim=1,
961
+ )
962
+ msk[:, -1:] = 1
963
+ else:
964
+ vae_input = torch.concat(
965
+ [
966
+ image.transpose(0, 1),
967
+ torch.zeros(3, num_frames - 1, height, width).to(image.device),
968
+ ],
969
+ dim=1,
970
+ )
971
+
972
+ msk = torch.concat(
973
+ [torch.repeat_interleave(msk[:, 0:1], repeats=4, dim=1), msk[:, 1:]], dim=1
974
+ )
975
+ msk = msk.view(1, msk.shape[1] // 4, 4, height // 8, width // 8)
976
+ msk = msk.transpose(1, 2)[0]
977
+
978
+ y = pipe.vae.encode(
979
+ [vae_input.to(dtype=pipe.torch_dtype, device=pipe.device)],
980
+ device=pipe.device,
981
+ tiled=tiled,
982
+ tile_size=tile_size,
983
+ tile_stride=tile_stride,
984
+ )[0]
985
+ y = y.to(dtype=pipe.torch_dtype, device=pipe.device)
986
+ y = torch.concat([msk, y])
987
+ y = y.unsqueeze(0)
988
+ y = y.to(dtype=pipe.torch_dtype, device=pipe.device)
989
+ return {"y": y}
990
+
991
+
992
+ class WanVideoUnit_ImageEmbedderFused(PipelineUnit):
993
+ """
994
+ Encode input image to latents using VAE. This unit is for Wan-AI/Wan2.2-TI2V-5B.
995
+ """
996
+
997
+ def __init__(self):
998
+ super().__init__(
999
+ input_params=(
1000
+ "input_image",
1001
+ "latents",
1002
+ "height",
1003
+ "width",
1004
+ "tiled",
1005
+ "tile_size",
1006
+ "tile_stride",
1007
+ ),
1008
+ onload_model_names=("vae",),
1009
+ )
1010
+
1011
+ def process(
1012
+ self,
1013
+ pipe: WanVideoPipeline,
1014
+ input_image,
1015
+ latents,
1016
+ height,
1017
+ width,
1018
+ tiled,
1019
+ tile_size,
1020
+ tile_stride,
1021
+ ):
1022
+ if input_image is None or not pipe.dit.fuse_vae_embedding_in_latents:
1023
+ return {}
1024
+ pipe.load_models_to_device(self.onload_model_names)
1025
+ image = pipe.preprocess_image(input_image.resize((width, height))).transpose(
1026
+ 0, 1
1027
+ )
1028
+ z = pipe.vae.encode(
1029
+ [image],
1030
+ device=pipe.device,
1031
+ tiled=tiled,
1032
+ tile_size=tile_size,
1033
+ tile_stride=tile_stride,
1034
+ )
1035
+ latents[:, :, 0:1] = z
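+ # Overwrite the first latent frame with the encoded input image; model_fn_wan_video then treats frame 0 as clean conditioning via the fused timestep handling.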
1036
+ return {
1037
+ "latents": latents,
1038
+ "fuse_vae_embedding_in_latents": True,
1039
+ "first_frame_latents": z,
1040
+ }
1041
+
1042
+
1043
+ class WanVideoUnit_FunControl(PipelineUnit):
1044
+ def __init__(self):
1045
+ super().__init__(
1046
+ input_params=(
1047
+ "control_video",
1048
+ "num_frames",
1049
+ "height",
1050
+ "width",
1051
+ "tiled",
1052
+ "tile_size",
1053
+ "tile_stride",
1054
+ "clip_feature",
1055
+ "y",
1056
+ ),
1057
+ onload_model_names=("vae",),
1058
+ )
1059
+
1060
+ def process(
1061
+ self,
1062
+ pipe: WanVideoPipeline,
1063
+ control_video,
1064
+ num_frames,
1065
+ height,
1066
+ width,
1067
+ tiled,
1068
+ tile_size,
1069
+ tile_stride,
1070
+ clip_feature,
1071
+ y,
1072
+ ):
1073
+ if control_video is None:
1074
+ return {}
1075
+ pipe.load_models_to_device(self.onload_model_names)
1076
+ control_video = pipe.preprocess_video(control_video)
1077
+ control_latents = pipe.vae.encode(
1078
+ control_video,
1079
+ device=pipe.device,
1080
+ tiled=tiled,
1081
+ tile_size=tile_size,
1082
+ tile_stride=tile_stride,
1083
+ ).to(dtype=pipe.torch_dtype, device=pipe.device)
1084
+ control_latents = control_latents.to(dtype=pipe.torch_dtype, device=pipe.device)
1085
+ if clip_feature is None or y is None:
1086
+ clip_feature = torch.zeros(
1087
+ (1, 257, 1280), dtype=pipe.torch_dtype, device=pipe.device
1088
+ )
1089
+ y = torch.zeros(
1090
+ (1, 16, (num_frames - 1) // 4 + 1, height // 8, width // 8),
1091
+ dtype=pipe.torch_dtype,
1092
+ device=pipe.device,
1093
+ )
1094
+ else:
1095
+ y = y[:, -16:]
1096
+ y = torch.concat([control_latents, y], dim=1)
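+ # Stack the control-video latents in front of the (possibly zero-filled) image-conditioning latents along the channel dimension.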
1097
+ return {"clip_feature": clip_feature, "y": y}
1098
+
1099
+
1100
+ class WanVideoUnit_FunReference(PipelineUnit):
1101
+ def __init__(self):
1102
+ super().__init__(
1103
+ input_params=("reference_image", "height", "width"),
1104
+ onload_model_names=("vae",),
1105
+ )
1106
+
1107
+ def process(self, pipe: WanVideoPipeline, reference_image, height, width):
1108
+ if reference_image is None:
1109
+ return {}
1110
+ pipe.load_models_to_device(["vae"])
1111
+ reference_image = reference_image.resize((width, height))
1112
+ reference_latents = pipe.preprocess_video([reference_image])
1113
+ reference_latents = pipe.vae.encode(reference_latents, device=pipe.device)
1114
+ clip_feature = pipe.preprocess_image(reference_image)
1115
+ clip_feature = pipe.image_encoder.encode_image([clip_feature])
1116
+ return {"reference_latents": reference_latents, "clip_feature": clip_feature}
1117
+
1118
+
1119
+ class WanVideoUnit_FunCameraControl(PipelineUnit):
1120
+ def __init__(self):
1121
+ super().__init__(
1122
+ input_params=(
1123
+ "height",
1124
+ "width",
1125
+ "num_frames",
1126
+ "camera_control_direction",
1127
+ "camera_control_speed",
1128
+ "camera_control_origin",
1129
+ "latents",
1130
+ "input_image",
1131
+ ),
1132
+ onload_model_names=("vae",),
1133
+ )
1134
+
1135
+ def process(
1136
+ self,
1137
+ pipe: WanVideoPipeline,
1138
+ height,
1139
+ width,
1140
+ num_frames,
1141
+ camera_control_direction,
1142
+ camera_control_speed,
1143
+ camera_control_origin,
1144
+ latents,
1145
+ input_image,
1146
+ ):
1147
+ if camera_control_direction is None:
1148
+ return {}
1149
+ camera_control_plucker_embedding = (
1150
+ pipe.dit.control_adapter.process_camera_coordinates(
1151
+ camera_control_direction,
1152
+ num_frames,
1153
+ height,
1154
+ width,
1155
+ camera_control_speed,
1156
+ camera_control_origin,
1157
+ )
1158
+ )
1159
+
1160
+ control_camera_video = (
1161
+ camera_control_plucker_embedding[:num_frames]
1162
+ .permute([3, 0, 1, 2])
1163
+ .unsqueeze(0)
1164
+ )
1165
+ control_camera_latents = torch.concat(
1166
+ [
1167
+ torch.repeat_interleave(
1168
+ control_camera_video[:, :, 0:1], repeats=4, dim=2
1169
+ ),
1170
+ control_camera_video[:, :, 1:],
1171
+ ],
1172
+ dim=2,
1173
+ ).transpose(1, 2)
1174
+ b, f, c, h, w = control_camera_latents.shape
1175
+ control_camera_latents = (
1176
+ control_camera_latents.contiguous()
1177
+ .view(b, f // 4, 4, c, h, w)
1178
+ .transpose(2, 3)
1179
+ )
1180
+ control_camera_latents = (
1181
+ control_camera_latents.contiguous()
1182
+ .view(b, f // 4, c * 4, h, w)
1183
+ .transpose(1, 2)
1184
+ )
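+ # Fold every 4 consecutive camera-embedding frames into the channel dimension so the camera condition lines up with the 4x temporally compressed latents.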
1185
+ control_camera_latents_input = control_camera_latents.to(
1186
+ device=pipe.device, dtype=pipe.torch_dtype
1187
+ )
1188
+
1189
+ input_image = input_image.resize((width, height))
1190
+ input_latents = pipe.preprocess_video([input_image])
1191
+ pipe.load_models_to_device(self.onload_model_names)
1192
+ input_latents = pipe.vae.encode(input_latents, device=pipe.device)
1193
+ y = torch.zeros_like(latents).to(pipe.device)
1194
+ y[:, :, :1] = input_latents
1195
+ y = y.to(dtype=pipe.torch_dtype, device=pipe.device)
1196
+ return {"control_camera_latents_input": control_camera_latents_input, "y": y}
1197
+
1198
+
1199
+ class WanVideoUnit_SpeedControl(PipelineUnit):
1200
+ def __init__(self):
1201
+ super().__init__(input_params=("motion_bucket_id",))
1202
+
1203
+ def process(self, pipe: WanVideoPipeline, motion_bucket_id):
1204
+ if motion_bucket_id is None:
1205
+ return {}
1206
+ motion_bucket_id = torch.Tensor((motion_bucket_id,)).to(
1207
+ dtype=pipe.torch_dtype, device=pipe.device
1208
+ )
1209
+ return {"motion_bucket_id": motion_bucket_id}
1210
+
1211
+
1212
+ class WanVideoUnit_VACE(PipelineUnit):
1213
+ def __init__(self):
1214
+ super().__init__(
1215
+ input_params=(
1216
+ "vace_video",
1217
+ "vace_video_mask",
1218
+ "vace_reference_image",
1219
+ "vace_scale",
1220
+ "height",
1221
+ "width",
1222
+ "num_frames",
1223
+ "tiled",
1224
+ "tile_size",
1225
+ "tile_stride",
1226
+ ),
1227
+ onload_model_names=("vae",),
1228
+ )
1229
+
1230
+ def process(
1231
+ self,
1232
+ pipe: WanVideoPipeline,
1233
+ vace_video,
1234
+ vace_video_mask,
1235
+ vace_reference_image,
1236
+ vace_scale,
1237
+ height,
1238
+ width,
1239
+ num_frames,
1240
+ tiled,
1241
+ tile_size,
1242
+ tile_stride,
1243
+ ):
1244
+ if (
1245
+ vace_video is not None
1246
+ or vace_video_mask is not None
1247
+ or vace_reference_image is not None
1248
+ ):
1249
+ pipe.load_models_to_device(["vae"])
1250
+ if vace_video is None:
1251
+ vace_video = torch.zeros(
1252
+ (1, 3, num_frames, height, width),
1253
+ dtype=pipe.torch_dtype,
1254
+ device=pipe.device,
1255
+ )
1256
+ else:
1257
+ vace_video = pipe.preprocess_video(vace_video)
1258
+
1259
+ if vace_video_mask is None:
1260
+ vace_video_mask = torch.ones_like(vace_video)
1261
+ else:
1262
+ vace_video_mask = pipe.preprocess_video(
1263
+ vace_video_mask, min_value=0, max_value=1
1264
+ )
1265
+
1266
+ inactive = vace_video * (1 - vace_video_mask) + 0 * vace_video_mask
1267
+ reactive = vace_video * vace_video_mask + 0 * (1 - vace_video_mask)
1268
+ inactive = pipe.vae.encode(
1269
+ inactive,
1270
+ device=pipe.device,
1271
+ tiled=tiled,
1272
+ tile_size=tile_size,
1273
+ tile_stride=tile_stride,
1274
+ ).to(dtype=pipe.torch_dtype, device=pipe.device)
1275
+ reactive = pipe.vae.encode(
1276
+ reactive,
1277
+ device=pipe.device,
1278
+ tiled=tiled,
1279
+ tile_size=tile_size,
1280
+ tile_stride=tile_stride,
1281
+ ).to(dtype=pipe.torch_dtype, device=pipe.device)
1282
+ vace_video_latents = torch.concat((inactive, reactive), dim=1)
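+ # The mask splits the video into "inactive" (content where the mask is 0) and "reactive" (content where the mask is 1) parts; both are VAE-encoded and concatenated along channels.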
1283
+
1284
+ vace_mask_latents = rearrange(
1285
+ vace_video_mask[0, 0], "T (H P) (W Q) -> 1 (P Q) T H W", P=8, Q=8
1286
+ )
1287
+ vace_mask_latents = torch.nn.functional.interpolate(
1288
+ vace_mask_latents,
1289
+ size=(
1290
+ (vace_mask_latents.shape[2] + 3) // 4,
1291
+ vace_mask_latents.shape[3],
1292
+ vace_mask_latents.shape[4],
1293
+ ),
1294
+ mode="nearest-exact",
1295
+ )
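+ # Map the pixel-space mask onto the latent grid: each 8x8 spatial patch becomes 64 channels and the temporal axis is downsampled by 4 (rounded up).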
1296
+
1297
+ if vace_reference_image is None:
1298
+ pass
1299
+ else:
1300
+ vace_reference_image = pipe.preprocess_video([vace_reference_image])
1301
+ vace_reference_latents = pipe.vae.encode(
1302
+ vace_reference_image,
1303
+ device=pipe.device,
1304
+ tiled=tiled,
1305
+ tile_size=tile_size,
1306
+ tile_stride=tile_stride,
1307
+ ).to(dtype=pipe.torch_dtype, device=pipe.device)
1308
+ vace_reference_latents = torch.concat(
1309
+ (vace_reference_latents, torch.zeros_like(vace_reference_latents)),
1310
+ dim=1,
1311
+ )
1312
+ vace_video_latents = torch.concat(
1313
+ (vace_reference_latents, vace_video_latents), dim=2
1314
+ )
1315
+ vace_mask_latents = torch.concat(
1316
+ (torch.zeros_like(vace_mask_latents[:, :, :1]), vace_mask_latents),
1317
+ dim=2,
1318
+ )
1319
+
1320
+ vace_context = torch.concat((vace_video_latents, vace_mask_latents), dim=1)
1321
+ return {"vace_context": vace_context, "vace_scale": vace_scale}
1322
+ else:
1323
+ return {"vace_context": None, "vace_scale": vace_scale}
1324
+
1325
+
1326
+ class WanVideoUnit_UnifiedSequenceParallel(PipelineUnit):
1327
+ def __init__(self):
1328
+ super().__init__(input_params=())
1329
+
1330
+ def process(self, pipe: WanVideoPipeline):
1331
+ if hasattr(pipe, "use_unified_sequence_parallel"):
1332
+ if pipe.use_unified_sequence_parallel:
1333
+ return {"use_unified_sequence_parallel": True}
1334
+ return {}
1335
+
1336
+
1337
+ class WanVideoUnit_TeaCache(PipelineUnit):
1338
+ def __init__(self):
1339
+ super().__init__(
1340
+ seperate_cfg=True,
1341
+ input_params_posi={
1342
+ "num_inference_steps": "num_inference_steps",
1343
+ "tea_cache_l1_thresh": "tea_cache_l1_thresh",
1344
+ "tea_cache_model_id": "tea_cache_model_id",
1345
+ },
1346
+ input_params_nega={
1347
+ "num_inference_steps": "num_inference_steps",
1348
+ "tea_cache_l1_thresh": "tea_cache_l1_thresh",
1349
+ "tea_cache_model_id": "tea_cache_model_id",
1350
+ },
1351
+ )
1352
+
1353
+ def process(
1354
+ self,
1355
+ pipe: WanVideoPipeline,
1356
+ num_inference_steps,
1357
+ tea_cache_l1_thresh,
1358
+ tea_cache_model_id,
1359
+ ):
1360
+ if tea_cache_l1_thresh is None:
1361
+ return {}
1362
+ return {
1363
+ "tea_cache": TeaCache(
1364
+ num_inference_steps,
1365
+ rel_l1_thresh=tea_cache_l1_thresh,
1366
+ model_id=tea_cache_model_id,
1367
+ )
1368
+ }
1369
+
1370
+
1371
+ class WanVideoUnit_CfgMerger(PipelineUnit):
1372
+ def __init__(self):
1373
+ super().__init__(take_over=True)
1374
+ self.concat_tensor_names = ["context", "clip_feature", "y", "reference_latents"]
1375
+
1376
+ def process(self, pipe: WanVideoPipeline, inputs_shared, inputs_posi, inputs_nega):
1377
+ if not inputs_shared["cfg_merge"]:
1378
+ return inputs_shared, inputs_posi, inputs_nega
1379
+ for name in self.concat_tensor_names:
1380
+ tensor_posi = inputs_posi.get(name)
1381
+ tensor_nega = inputs_nega.get(name)
1382
+ tensor_shared = inputs_shared.get(name)
1383
+ if tensor_posi is not None and tensor_nega is not None:
1384
+ inputs_shared[name] = torch.concat((tensor_posi, tensor_nega), dim=0)
1385
+ elif tensor_shared is not None:
1386
+ inputs_shared[name] = torch.concat(
1387
+ (tensor_shared, tensor_shared), dim=0
1388
+ )
1389
+ inputs_posi.clear()
1390
+ inputs_nega.clear()
1391
+ return inputs_shared, inputs_posi, inputs_nega
1392
+
1393
+
1394
+ class TeaCache:
1395
+ def __init__(self, num_inference_steps, rel_l1_thresh, model_id):
1396
+ self.num_inference_steps = num_inference_steps
1397
+ self.step = 0
1398
+ self.accumulated_rel_l1_distance = 0
1399
+ self.previous_modulated_input = None
1400
+ self.rel_l1_thresh = rel_l1_thresh
1401
+ self.previous_residual = None
1402
+ self.previous_hidden_states = None
1403
+
1404
+ self.coefficients_dict = {
1405
+ "Wan2.1-T2V-1.3B": [
1406
+ -5.21862437e04,
1407
+ 9.23041404e03,
1408
+ -5.28275948e02,
1409
+ 1.36987616e01,
1410
+ -4.99875664e-02,
1411
+ ],
1412
+ "Wan2.1-T2V-14B": [
1413
+ -3.03318725e05,
1414
+ 4.90537029e04,
1415
+ -2.65530556e03,
1416
+ 5.87365115e01,
1417
+ -3.15583525e-01,
1418
+ ],
1419
+ "Wan2.1-I2V-14B-480P": [
1420
+ 2.57151496e05,
1421
+ -3.54229917e04,
1422
+ 1.40286849e03,
1423
+ -1.35890334e01,
1424
+ 1.32517977e-01,
1425
+ ],
1426
+ "Wan2.1-I2V-14B-720P": [
1427
+ 8.10705460e03,
1428
+ 2.13393892e03,
1429
+ -3.72934672e02,
1430
+ 1.66203073e01,
1431
+ -4.17769401e-02,
1432
+ ],
1433
+ }
1434
+ if model_id not in self.coefficients_dict:
1435
+ supported_model_ids = ", ".join([i for i in self.coefficients_dict])
1436
+ raise ValueError(
1437
+ f"{model_id} is not a supported TeaCache model id. Please choose a valid model id in ({supported_model_ids})."
1438
+ )
1439
+ self.coefficients = self.coefficients_dict[model_id]
1440
+
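+ # check() decides whether the transformer blocks can be skipped this step: it accumulates a polynomial-rescaled relative L1 change of the modulated input and returns True (reuse the cached residual) while the accumulation stays below rel_l1_thresh.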
1441
+ def check(self, dit: WanModel, x, t_mod):
1442
+ modulated_inp = t_mod.clone()
1443
+ if self.step == 0 or self.step == self.num_inference_steps - 1:
1444
+ should_calc = True
1445
+ self.accumulated_rel_l1_distance = 0
1446
+ else:
1447
+ coefficients = self.coefficients
1448
+ rescale_func = np.poly1d(coefficients)
1449
+ self.accumulated_rel_l1_distance += rescale_func(
1450
+ (
1451
+ (modulated_inp - self.previous_modulated_input).abs().mean()
1452
+ / self.previous_modulated_input.abs().mean()
1453
+ )
1454
+ .cpu()
1455
+ .item()
1456
+ )
1457
+ if self.accumulated_rel_l1_distance < self.rel_l1_thresh:
1458
+ should_calc = False
1459
+ else:
1460
+ should_calc = True
1461
+ self.accumulated_rel_l1_distance = 0
1462
+ self.previous_modulated_input = modulated_inp
1463
+ self.step += 1
1464
+ if self.step == self.num_inference_steps:
1465
+ self.step = 0
1466
+ if should_calc:
1467
+ self.previous_hidden_states = x.clone()
1468
+ return not should_calc
1469
+
1470
+ def store(self, hidden_states):
1471
+ self.previous_residual = hidden_states - self.previous_hidden_states
1472
+ self.previous_hidden_states = None
1473
+
1474
+ def update(self, hidden_states):
1475
+ hidden_states = hidden_states + self.previous_residual
1476
+ return hidden_states
1477
+
1478
+
1479
+ class TemporalTiler_BCTHW:
1480
+ def __init__(self):
1481
+ pass
1482
+
1483
+ def build_1d_mask(self, length, left_bound, right_bound, border_width):
1484
+ x = torch.ones((length,))
1485
+ if not left_bound:
1486
+ x[:border_width] = (torch.arange(border_width) + 1) / border_width
1487
+ if not right_bound:
1488
+ x[-border_width:] = torch.flip(
1489
+ (torch.arange(border_width) + 1) / border_width, dims=(0,)
1490
+ )
1491
+ return x
1492
+
1493
+ def build_mask(self, data, is_bound, border_width):
1494
+ _, _, T, _, _ = data.shape
1495
+ t = self.build_1d_mask(T, is_bound[0], is_bound[1], border_width[0])
1496
+ mask = repeat(t, "T -> 1 1 T 1 1")
1497
+ return mask
1498
+
1499
+ def run(
1500
+ self,
1501
+ model_fn,
1502
+ sliding_window_size,
1503
+ sliding_window_stride,
1504
+ computation_device,
1505
+ computation_dtype,
1506
+ model_kwargs,
1507
+ tensor_names,
1508
+ batch_size=None,
1509
+ ):
1510
+ tensor_names = [
1511
+ tensor_name
1512
+ for tensor_name in tensor_names
1513
+ if model_kwargs.get(tensor_name) is not None
1514
+ ]
1515
+ tensor_dict = {
1516
+ tensor_name: model_kwargs[tensor_name] for tensor_name in tensor_names
1517
+ }
1518
+ B, C, T, H, W = tensor_dict[tensor_names[0]].shape
1519
+ if batch_size is not None:
1520
+ B *= batch_size
1521
+ data_device, data_dtype = (
1522
+ tensor_dict[tensor_names[0]].device,
1523
+ tensor_dict[tensor_names[0]].dtype,
1524
+ )
1525
+ value = torch.zeros((B, C, T, H, W), device=data_device, dtype=data_dtype)
1526
+ weight = torch.zeros((1, 1, T, 1, 1), device=data_device, dtype=data_dtype)
1527
+ for t in range(0, T, sliding_window_stride):
1528
+ if (
1529
+ t - sliding_window_stride >= 0
1530
+ and t - sliding_window_stride + sliding_window_size >= T
1531
+ ):
1532
+ continue
1533
+ t_ = min(t + sliding_window_size, T)
1534
+ model_kwargs.update(
1535
+ {
1536
+ tensor_name: tensor_dict[tensor_name][:, :, t:t_:, :].to(
1537
+ device=computation_device, dtype=computation_dtype
1538
+ )
1539
+ for tensor_name in tensor_names
1540
+ }
1541
+ )
1542
+ model_output = model_fn(**model_kwargs).to(
1543
+ device=data_device, dtype=data_dtype
1544
+ )
1545
+ mask = self.build_mask(
1546
+ model_output,
1547
+ is_bound=(t == 0, t_ == T),
1548
+ border_width=(sliding_window_size - sliding_window_stride,),
1549
+ ).to(device=data_device, dtype=data_dtype)
1550
+ value[:, :, t:t_, :, :] += model_output * mask
1551
+ weight[:, :, t:t_, :, :] += mask
1552
+ value /= weight
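+ # Overlapping window outputs are cross-faded with the linear border ramps from build_mask and normalized by the accumulated weights.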
1553
+ model_kwargs.update(tensor_dict)
1554
+ return value
1555
+
1556
+
1557
+ def model_fn_wan_video(
1558
+ dit: WanModel,
1559
+ motion_controller: WanMotionControllerModel = None,
1560
+ vace: VaceWanModel = None,
1561
+ latents: torch.Tensor = None,
1562
+ timestep: torch.Tensor = None,
1563
+ context: torch.Tensor = None,
1564
+ clip_feature: Optional[torch.Tensor] = None,
1565
+ y: Optional[torch.Tensor] = None,
1566
+ reference_latents=None,
1567
+ vace_context=None,
1568
+ vace_scale=1.0,
1569
+ tea_cache: TeaCache = None,
1570
+ use_unified_sequence_parallel: bool = False,
1571
+ motion_bucket_id: Optional[torch.Tensor] = None,
1572
+ sliding_window_size: Optional[int] = None,
1573
+ sliding_window_stride: Optional[int] = None,
1574
+ cfg_merge: bool = False,
1575
+ use_gradient_checkpointing: bool = False,
1576
+ use_gradient_checkpointing_offload: bool = False,
1577
+ control_camera_latents_input=None,
1578
+ fuse_vae_embedding_in_latents: bool = False,
1579
+ ip_image=None,
1580
+ **kwargs,
1581
+ ):
1582
+ if sliding_window_size is not None and sliding_window_stride is not None:
1583
+ model_kwargs = dict(
1584
+ dit=dit,
1585
+ motion_controller=motion_controller,
1586
+ vace=vace,
1587
+ latents=latents,
1588
+ timestep=timestep,
1589
+ context=context,
1590
+ clip_feature=clip_feature,
1591
+ y=y,
1592
+ reference_latents=reference_latents,
1593
+ vace_context=vace_context,
1594
+ vace_scale=vace_scale,
1595
+ tea_cache=tea_cache,
1596
+ use_unified_sequence_parallel=use_unified_sequence_parallel,
1597
+ motion_bucket_id=motion_bucket_id,
1598
+ )
1599
+ return TemporalTiler_BCTHW().run(
1600
+ model_fn_wan_video,
1601
+ sliding_window_size,
1602
+ sliding_window_stride,
1603
+ latents.device,
1604
+ latents.dtype,
1605
+ model_kwargs=model_kwargs,
1606
+ tensor_names=["latents", "y"],
1607
+ batch_size=2 if cfg_merge else 1,
1608
+ )
1609
+
1610
+ if use_unified_sequence_parallel:
1611
+ import torch.distributed as dist
1612
+ from xfuser.core.distributed import (
1613
+ get_sequence_parallel_rank,
1614
+ get_sequence_parallel_world_size,
1615
+ get_sp_group,
1616
+ )
1617
+ x_ip = None
1618
+ t_mod_ip = None
1619
+ # Timestep
1620
+ if dit.seperated_timestep and fuse_vae_embedding_in_latents:
1621
+ timestep = torch.concat(
1622
+ [
1623
+ torch.zeros(
1624
+ (1, latents.shape[3] * latents.shape[4] // 4),
1625
+ dtype=latents.dtype,
1626
+ device=latents.device,
1627
+ ),
1628
+ torch.ones(
1629
+ (latents.shape[2] - 1, latents.shape[3] * latents.shape[4] // 4),
1630
+ dtype=latents.dtype,
1631
+ device=latents.device,
1632
+ )
1633
+ * timestep,
1634
+ ]
1635
+ ).flatten()
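+ # With a fused first-frame latent, frame 0 gets timestep 0 (it is already clean) while the remaining latent frames share the current denoising timestep, one value per spatial patch.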
1636
+ t = dit.time_embedding(
1637
+ sinusoidal_embedding_1d(dit.freq_dim, timestep).unsqueeze(0)
1638
+ )
1639
+ t_mod = dit.time_projection(t).unflatten(2, (6, dit.dim))
1640
+ else:
1641
+ t = dit.time_embedding(sinusoidal_embedding_1d(dit.freq_dim, timestep))
1642
+ t_mod = dit.time_projection(t).unflatten(1, (6, dit.dim))
1643
+
1644
+ if ip_image is not None:
1645
+ timestep_ip = torch.zeros_like(timestep) # [B] with 0s
1646
+ t_ip = dit.time_embedding(sinusoidal_embedding_1d(dit.freq_dim, timestep_ip))
1647
+ t_mod_ip = dit.time_projection(t_ip).unflatten(1, (6, dit.dim))
1648
+
1649
+ # Motion Controller
1650
+ if motion_bucket_id is not None and motion_controller is not None:
1651
+ t_mod = t_mod + motion_controller(motion_bucket_id).unflatten(1, (6, dit.dim))
1652
+ context = dit.text_embedding(context)
1653
+
1654
+ x = latents
1655
+ # Merged cfg
1656
+ if x.shape[0] != context.shape[0]:
1657
+ x = torch.concat([x] * context.shape[0], dim=0)
1658
+ if timestep.shape[0] != context.shape[0]:
1659
+ timestep = torch.concat([timestep] * context.shape[0], dim=0)
1660
+
1661
+ # Image Embedding
1662
+ if y is not None and dit.require_vae_embedding:
1663
+ x = torch.cat([x, y], dim=1)
1664
+ if clip_feature is not None and dit.require_clip_embedding:
1665
+ clip_embedding = dit.img_emb(clip_feature)
1666
+ context = torch.cat([clip_embedding, context], dim=1)
1667
+
1668
+ # Add camera control
1669
+ x, (f, h, w) = dit.patchify(x, control_camera_latents_input)
1670
+
1671
+ # Reference image
1672
+ if reference_latents is not None:
1673
+ if len(reference_latents.shape) == 5:
1674
+ reference_latents = reference_latents[:, :, 0]
1675
+ reference_latents = dit.ref_conv(reference_latents).flatten(2).transpose(1, 2)
1676
+ x = torch.concat([reference_latents, x], dim=1)
1677
+ f += 1
1678
+
1679
+ offset = 1
1680
+ freqs = (
1681
+ torch.cat(
1682
+ [
1683
+ dit.freqs[0][offset : f + offset].view(f, 1, 1, -1).expand(f, h, w, -1),
1684
+ dit.freqs[1][offset : h + offset].view(1, h, 1, -1).expand(f, h, w, -1),
1685
+ dit.freqs[2][offset : w + offset].view(1, 1, w, -1).expand(f, h, w, -1),
1686
+ ],
1687
+ dim=-1,
1688
+ )
1689
+ .reshape(f * h * w, 1, -1)
1690
+ .to(x.device)
1691
+ )
1692
+
1693
+ ############################################################################################
1694
+ if ip_image is not None:
1695
+ x_ip, (f_ip, h_ip, w_ip) = dit.patchify(
1696
+ ip_image
1697
+ ) # x_ip [1, 1024, 5120] [B, N, D] f_ip = 1 h_ip = 32 w_ip = 32
1698
+ freqs_ip = (
1699
+ torch.cat(
1700
+ [
1701
+ dit.freqs[0][0].view(f_ip, 1, 1, -1).expand(f_ip, h_ip, w_ip, -1),
1702
+ dit.freqs[1][h + offset : h + offset + h_ip]
1703
+ .view(1, h_ip, 1, -1)
1704
+ .expand(f_ip, h_ip, w_ip, -1),
1705
+ dit.freqs[2][w + offset : w + offset + w_ip]
1706
+ .view(1, 1, w_ip, -1)
1707
+ .expand(f_ip, h_ip, w_ip, -1),
1708
+ ],
1709
+ dim=-1,
1710
+ )
1711
+ .reshape(f_ip * h_ip * w_ip, 1, -1)
1712
+ .to(x_ip.device)
1713
+ )
1714
+ freqs_original = freqs
1715
+ freqs = torch.cat([freqs, freqs_ip], dim=0)
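+ # Stand-In: the identity-image (ip_image) tokens are appended after the video tokens, with RoPE positions placed outside the video grid so attention can distinguish them from video content.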
1716
+ ############################################################################################
1717
+ else:
1718
+ freqs_original = freqs
1719
+ # TeaCache
1720
+ if tea_cache is not None:
1721
+ tea_cache_update = tea_cache.check(dit, x, t_mod)
1722
+ else:
1723
+ tea_cache_update = False
1724
+
1725
+ if vace_context is not None:
1726
+ vace_hints = vace(x, vace_context, context, t_mod, freqs_original)
1727
+
1728
+ # blocks
1729
+ if use_unified_sequence_parallel:
1730
+ if dist.is_initialized() and dist.get_world_size() > 1:
1731
+ x = torch.chunk(x, get_sequence_parallel_world_size(), dim=1)[
1732
+ get_sequence_parallel_rank()
1733
+ ]
1734
+ if tea_cache_update:
1735
+ x = tea_cache.update(x)
1736
+ else:
1737
+
1738
+ def create_custom_forward(module):
1739
+ def custom_forward(*inputs):
1740
+ return module(*inputs)
1741
+
1742
+ return custom_forward
1743
+
1744
+ for block_id, block in enumerate(dit.blocks):
1745
+ if use_gradient_checkpointing_offload:
1746
+ with torch.autograd.graph.save_on_cpu():
1747
+ x, x_ip = torch.utils.checkpoint.checkpoint(
1748
+ create_custom_forward(block),
1749
+ x,
1750
+ context,
1751
+ t_mod,
1752
+ freqs,
1753
+ x_ip=x_ip,
1754
+ t_mod_ip=t_mod_ip,
1755
+ use_reentrant=False,
1756
+ )
1757
+ elif use_gradient_checkpointing:
1758
+ x, x_ip = torch.utils.checkpoint.checkpoint(
1759
+ create_custom_forward(block),
1760
+ x,
1761
+ context,
1762
+ t_mod,
1763
+ freqs,
1764
+ x_ip=x_ip,
1765
+ t_mod_ip=t_mod_ip,
1766
+ use_reentrant=False,
1767
+ )
1768
+ else:
1769
+ x, x_ip = block(x, context, t_mod, freqs, x_ip=x_ip, t_mod_ip=t_mod_ip)
1770
+ if vace_context is not None and block_id in vace.vace_layers_mapping:
1771
+ current_vace_hint = vace_hints[vace.vace_layers_mapping[block_id]]
1772
+ if (
1773
+ use_unified_sequence_parallel
1774
+ and dist.is_initialized()
1775
+ and dist.get_world_size() > 1
1776
+ ):
1777
+ current_vace_hint = torch.chunk(
1778
+ current_vace_hint, get_sequence_parallel_world_size(), dim=1
1779
+ )[get_sequence_parallel_rank()]
1780
+ x = x + current_vace_hint * vace_scale
1781
+ if tea_cache is not None:
1782
+ tea_cache.store(x)
1783
+
1784
+ x = dit.head(x, t)
1785
+ if use_unified_sequence_parallel:
1786
+ if dist.is_initialized() and dist.get_world_size() > 1:
1787
+ x = get_sp_group().all_gather(x, dim=1)
1788
+ # Remove reference latents
1789
+ if reference_latents is not None:
1790
+ x = x[:, reference_latents.shape[1] :]
1791
+ f -= 1
1792
+ x = dit.unpatchify(x, (f, h, w))
1793
+ return x
pipelines/wan_video_face_swap.py ADDED
@@ -0,0 +1,1786 @@
1
+ import torch, types
2
+ import numpy as np
3
+ from PIL import Image
4
+ from einops import repeat
5
+ from typing import Optional, Union
6
+ from einops import rearrange
8
+ from tqdm import tqdm
10
+ from typing_extensions import Literal
11
+ import imageio
12
+ import os
13
+ from typing import List
14
+ import cv2
15
+
16
+ from utils import BasePipeline, ModelConfig, PipelineUnit, PipelineUnitRunner
17
+ from models import ModelManager, load_state_dict
18
+ from models.wan_video_dit import WanModel, RMSNorm, sinusoidal_embedding_1d
19
+ from models.wan_video_text_encoder import (
20
+ WanTextEncoder,
21
+ T5RelativeEmbedding,
22
+ T5LayerNorm,
23
+ )
24
+ from models.wan_video_vae import WanVideoVAE, RMS_norm, CausalConv3d, Upsample
25
+ from models.wan_video_image_encoder import WanImageEncoder
26
+ from models.wan_video_vace import VaceWanModel
27
+ from models.wan_video_motion_controller import WanMotionControllerModel
28
+ from schedulers.flow_match import FlowMatchScheduler
29
+ from prompters import WanPrompter
30
+ from vram_management import (
31
+ enable_vram_management,
32
+ AutoWrappedModule,
33
+ AutoWrappedLinear,
34
+ WanAutoCastLayerNorm,
35
+ )
36
+ from lora import GeneralLoRALoader
37
+
38
+
39
+ def load_video_as_list(video_path: str) -> List[Image.Image]:
40
+ if not os.path.isfile(video_path):
41
+ raise FileNotFoundError(video_path)
42
+ reader = imageio.get_reader(video_path)
43
+ frames = []
44
+ for i, frame_data in enumerate(reader):
45
+ pil_image = Image.fromarray(frame_data)
46
+ frames.append(pil_image)
47
+ reader.close()
48
+ return frames
49
+
50
+
51
+ class WanVideoPipeline_FaceSwap(BasePipeline):
52
+ def __init__(self, device="cuda", torch_dtype=torch.bfloat16, tokenizer_path=None):
53
+ super().__init__(
54
+ device=device,
55
+ torch_dtype=torch_dtype,
56
+ height_division_factor=16,
57
+ width_division_factor=16,
58
+ time_division_factor=4,
59
+ time_division_remainder=1,
60
+ )
61
+ self.scheduler = FlowMatchScheduler(shift=5, sigma_min=0.0, extra_one_step=True)
62
+ self.prompter = WanPrompter(tokenizer_path=tokenizer_path)
63
+ self.text_encoder: WanTextEncoder = None
64
+ self.image_encoder: WanImageEncoder = None
65
+ self.dit: WanModel = None
66
+ self.dit2: WanModel = None
67
+ self.vae: WanVideoVAE = None
68
+ self.motion_controller: WanMotionControllerModel = None
69
+ self.vace: VaceWanModel = None
70
+ self.in_iteration_models = ("dit", "motion_controller", "vace")
71
+ self.in_iteration_models_2 = ("dit2", "motion_controller", "vace")
72
+ self.unit_runner = PipelineUnitRunner()
73
+ self.units = [
74
+ WanVideoUnit_ShapeChecker(),
75
+ WanVideoUnit_NoiseInitializer(),
76
+ WanVideoUnit_InputVideoEmbedder(),
77
+ WanVideoUnit_PromptEmbedder(),
78
+ WanVideoUnit_ImageEmbedderVAE(),
79
+ WanVideoUnit_ImageEmbedderCLIP(),
80
+ WanVideoUnit_ImageEmbedderFused(),
81
+ WanVideoUnit_FunControl(),
82
+ WanVideoUnit_FunReference(),
83
+ WanVideoUnit_FunCameraControl(),
84
+ WanVideoUnit_SpeedControl(),
85
+ WanVideoUnit_VACE(),
86
+ WanVideoUnit_UnifiedSequenceParallel(),
87
+ WanVideoUnit_TeaCache(),
88
+ WanVideoUnit_CfgMerger(),
89
+ ]
90
+ self.model_fn = model_fn_wan_video
91
+
92
+ def encode_ip_image(self, ip_image):
93
+ self.load_models_to_device(["vae"])
94
+ ip_image = (
95
+ torch.tensor(np.array(ip_image)).permute(2, 0, 1).float() / 255.0
96
+ ) # [3, H, W]
97
+ ip_image = (
98
+ ip_image.unsqueeze(1).unsqueeze(0).to(dtype=self.torch_dtype)
99
+ ) # [B, 3, 1, H, W]
100
+ ip_image = ip_image * 2 - 1
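+ # Scale the identity image from [0, 1] to [-1, 1] and encode it as a single-frame latent with the VAE.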
101
+ ip_image_latent = self.vae.encode(ip_image, device=self.device, tiled=False)
102
+ return ip_image_latent
103
+
104
+ def load_lora(self, module, path, alpha=1):
105
+ loader = GeneralLoRALoader(torch_dtype=self.torch_dtype, device=self.device)
106
+ lora = load_state_dict(path, torch_dtype=self.torch_dtype, device=self.device)
107
+ loader.load(module, lora, alpha=alpha)
108
+
109
+ def training_loss(self, **inputs):
110
+ max_timestep_boundary = int(
111
+ inputs.get("max_timestep_boundary", 1) * self.scheduler.num_train_timesteps
112
+ )
113
+ min_timestep_boundary = int(
114
+ inputs.get("min_timestep_boundary", 0) * self.scheduler.num_train_timesteps
115
+ )
116
+ timestep_id = torch.randint(min_timestep_boundary, max_timestep_boundary, (1,))
117
+ timestep = self.scheduler.timesteps[timestep_id].to(
118
+ dtype=self.torch_dtype, device=self.device
119
+ )
120
+
121
+ inputs["latents"] = self.scheduler.add_noise(
122
+ inputs["input_latents"], inputs["noise"], timestep
123
+ )
124
+ training_target = self.scheduler.training_target(
125
+ inputs["input_latents"], inputs["noise"], timestep
126
+ )
127
+
128
+ noise_pred = self.model_fn(**inputs, timestep=timestep)
129
+
130
+ loss = torch.nn.functional.mse_loss(noise_pred.float(), training_target.float())
131
+ loss = loss * self.scheduler.training_weight(timestep)
132
+ return loss
133
+
134
+ def enable_vram_management(
135
+ self, num_persistent_param_in_dit=None, vram_limit=None, vram_buffer=0.5
136
+ ):
137
+ self.vram_management_enabled = True
138
+ if num_persistent_param_in_dit is not None:
139
+ vram_limit = None
140
+ else:
141
+ if vram_limit is None:
142
+ vram_limit = self.get_vram()
143
+ vram_limit = vram_limit - vram_buffer
144
+ if self.text_encoder is not None:
145
+ dtype = next(iter(self.text_encoder.parameters())).dtype
146
+ enable_vram_management(
147
+ self.text_encoder,
148
+ module_map={
149
+ torch.nn.Linear: AutoWrappedLinear,
150
+ torch.nn.Embedding: AutoWrappedModule,
151
+ T5RelativeEmbedding: AutoWrappedModule,
152
+ T5LayerNorm: AutoWrappedModule,
153
+ },
154
+ module_config=dict(
155
+ offload_dtype=dtype,
156
+ offload_device="cpu",
157
+ onload_dtype=dtype,
158
+ onload_device="cpu",
159
+ computation_dtype=self.torch_dtype,
160
+ computation_device=self.device,
161
+ ),
162
+ vram_limit=vram_limit,
163
+ )
164
+ if self.dit is not None:
165
+ dtype = next(iter(self.dit.parameters())).dtype
166
+ device = "cpu" if vram_limit is not None else self.device
167
+ enable_vram_management(
168
+ self.dit,
169
+ module_map={
170
+ torch.nn.Linear: AutoWrappedLinear,
171
+ torch.nn.Conv3d: AutoWrappedModule,
172
+ torch.nn.LayerNorm: WanAutoCastLayerNorm,
173
+ RMSNorm: AutoWrappedModule,
174
+ torch.nn.Conv2d: AutoWrappedModule,
175
+ },
176
+ module_config=dict(
177
+ offload_dtype=dtype,
178
+ offload_device="cpu",
179
+ onload_dtype=dtype,
180
+ onload_device=device,
181
+ computation_dtype=self.torch_dtype,
182
+ computation_device=self.device,
183
+ ),
184
+ max_num_param=num_persistent_param_in_dit,
185
+ overflow_module_config=dict(
186
+ offload_dtype=dtype,
187
+ offload_device="cpu",
188
+ onload_dtype=dtype,
189
+ onload_device="cpu",
190
+ computation_dtype=self.torch_dtype,
191
+ computation_device=self.device,
192
+ ),
193
+ vram_limit=vram_limit,
194
+ )
195
+ if self.dit2 is not None:
196
+ dtype = next(iter(self.dit2.parameters())).dtype
197
+ device = "cpu" if vram_limit is not None else self.device
198
+ enable_vram_management(
199
+ self.dit2,
200
+ module_map={
201
+ torch.nn.Linear: AutoWrappedLinear,
202
+ torch.nn.Conv3d: AutoWrappedModule,
203
+ torch.nn.LayerNorm: WanAutoCastLayerNorm,
204
+ RMSNorm: AutoWrappedModule,
205
+ torch.nn.Conv2d: AutoWrappedModule,
206
+ },
207
+ module_config=dict(
208
+ offload_dtype=dtype,
209
+ offload_device="cpu",
210
+ onload_dtype=dtype,
211
+ onload_device=device,
212
+ computation_dtype=self.torch_dtype,
213
+ computation_device=self.device,
214
+ ),
215
+ max_num_param=num_persistent_param_in_dit,
216
+ overflow_module_config=dict(
217
+ offload_dtype=dtype,
218
+ offload_device="cpu",
219
+ onload_dtype=dtype,
220
+ onload_device="cpu",
221
+ computation_dtype=self.torch_dtype,
222
+ computation_device=self.device,
223
+ ),
224
+ vram_limit=vram_limit,
225
+ )
226
+ if self.vae is not None:
227
+ dtype = next(iter(self.vae.parameters())).dtype
228
+ enable_vram_management(
229
+ self.vae,
230
+ module_map={
231
+ torch.nn.Linear: AutoWrappedLinear,
232
+ torch.nn.Conv2d: AutoWrappedModule,
233
+ RMS_norm: AutoWrappedModule,
234
+ CausalConv3d: AutoWrappedModule,
235
+ Upsample: AutoWrappedModule,
236
+ torch.nn.SiLU: AutoWrappedModule,
237
+ torch.nn.Dropout: AutoWrappedModule,
238
+ },
239
+ module_config=dict(
240
+ offload_dtype=dtype,
241
+ offload_device="cpu",
242
+ onload_dtype=dtype,
243
+ onload_device=self.device,
244
+ computation_dtype=self.torch_dtype,
245
+ computation_device=self.device,
246
+ ),
247
+ )
248
+ if self.image_encoder is not None:
249
+ dtype = next(iter(self.image_encoder.parameters())).dtype
250
+ enable_vram_management(
251
+ self.image_encoder,
252
+ module_map={
253
+ torch.nn.Linear: AutoWrappedLinear,
254
+ torch.nn.Conv2d: AutoWrappedModule,
255
+ torch.nn.LayerNorm: AutoWrappedModule,
256
+ },
257
+ module_config=dict(
258
+ offload_dtype=dtype,
259
+ offload_device="cpu",
260
+ onload_dtype=dtype,
261
+ onload_device="cpu",
262
+ computation_dtype=dtype,
263
+ computation_device=self.device,
264
+ ),
265
+ )
266
+ if self.motion_controller is not None:
267
+ dtype = next(iter(self.motion_controller.parameters())).dtype
268
+ enable_vram_management(
269
+ self.motion_controller,
270
+ module_map={
271
+ torch.nn.Linear: AutoWrappedLinear,
272
+ },
273
+ module_config=dict(
274
+ offload_dtype=dtype,
275
+ offload_device="cpu",
276
+ onload_dtype=dtype,
277
+ onload_device="cpu",
278
+ computation_dtype=dtype,
279
+ computation_device=self.device,
280
+ ),
281
+ )
282
+ if self.vace is not None:
283
+ dtype = next(iter(self.vace.parameters())).dtype
+ device = "cpu" if vram_limit is not None else self.device
284
+ enable_vram_management(
285
+ self.vace,
286
+ module_map={
287
+ torch.nn.Linear: AutoWrappedLinear,
288
+ torch.nn.Conv3d: AutoWrappedModule,
289
+ torch.nn.LayerNorm: AutoWrappedModule,
290
+ RMSNorm: AutoWrappedModule,
291
+ },
292
+ module_config=dict(
293
+ offload_dtype=dtype,
294
+ offload_device="cpu",
295
+ onload_dtype=dtype,
296
+ onload_device=device,
297
+ computation_dtype=self.torch_dtype,
298
+ computation_device=self.device,
299
+ ),
300
+ vram_limit=vram_limit,
301
+ )
302
+
303
+ def initialize_usp(self):
304
+ import torch.distributed as dist
305
+ from xfuser.core.distributed import (
306
+ initialize_model_parallel,
307
+ init_distributed_environment,
308
+ )
309
+
310
+ dist.init_process_group(backend="nccl", init_method="env://")
311
+ init_distributed_environment(
312
+ rank=dist.get_rank(), world_size=dist.get_world_size()
313
+ )
314
+ initialize_model_parallel(
315
+ sequence_parallel_degree=dist.get_world_size(),
316
+ ring_degree=1,
317
+ ulysses_degree=dist.get_world_size(),
318
+ )
319
+ torch.cuda.set_device(dist.get_rank())
320
+
321
+ def enable_usp(self):
322
+ from xfuser.core.distributed import get_sequence_parallel_world_size
323
+ from distributed.xdit_context_parallel import (
324
+ usp_attn_forward,
325
+ usp_dit_forward,
326
+ )
327
+
328
+ for block in self.dit.blocks:
329
+ block.self_attn.forward = types.MethodType(
330
+ usp_attn_forward, block.self_attn
331
+ )
332
+ self.dit.forward = types.MethodType(usp_dit_forward, self.dit)
333
+ if self.dit2 is not None:
334
+ for block in self.dit2.blocks:
335
+ block.self_attn.forward = types.MethodType(
336
+ usp_attn_forward, block.self_attn
337
+ )
338
+ self.dit2.forward = types.MethodType(usp_dit_forward, self.dit2)
339
+ self.sp_size = get_sequence_parallel_world_size()
340
+ self.use_unified_sequence_parallel = True
341
+
342
+ @staticmethod
343
+ def from_pretrained(
344
+ torch_dtype: torch.dtype = torch.bfloat16,
345
+ device: Union[str, torch.device] = "cuda",
346
+ model_configs: list[ModelConfig] = [],
347
+ tokenizer_config: ModelConfig = ModelConfig(
348
+ model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="google/*"
349
+ ),
350
+ redirect_common_files: bool = True,
351
+ use_usp=False,
352
+ ):
353
+ # Redirect model path
354
+ if redirect_common_files:
355
+ redirect_dict = {
356
+ "models_t5_umt5-xxl-enc-bf16.pth": "Wan-AI/Wan2.1-T2V-1.3B",
357
+ "Wan2.1_VAE.pth": "Wan-AI/Wan2.1-T2V-1.3B",
358
+ "models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth": "Wan-AI/Wan2.1-I2V-14B-480P",
359
+ }
360
+ for model_config in model_configs:
361
+ if (
362
+ model_config.origin_file_pattern is None
363
+ or model_config.model_id is None
364
+ ):
365
+ continue
366
+ if (
367
+ model_config.origin_file_pattern in redirect_dict
368
+ and model_config.model_id
369
+ != redirect_dict[model_config.origin_file_pattern]
370
+ ):
371
+ print(
372
+ f"To avoid repeatedly downloading model files, ({model_config.model_id}, {model_config.origin_file_pattern}) is redirected to ({redirect_dict[model_config.origin_file_pattern]}, {model_config.origin_file_pattern}). You can use `redirect_common_files=False` to disable file redirection."
373
+ )
374
+ model_config.model_id = redirect_dict[
375
+ model_config.origin_file_pattern
376
+ ]
377
+
378
+ # Initialize pipeline
379
+ pipe = WanVideoPipeline_FaceSwap(device=device, torch_dtype=torch_dtype)
380
+ if use_usp:
381
+ pipe.initialize_usp()
382
+
383
+ # Download and load models
384
+ model_manager = ModelManager()
385
+ for model_config in model_configs:
386
+ model_config.download_if_necessary(use_usp=use_usp)
387
+ model_manager.load_model(
388
+ model_config.path,
389
+ device=model_config.offload_device or device,
390
+ torch_dtype=model_config.offload_dtype or torch_dtype,
391
+ )
392
+
393
+ # Load models
394
+ pipe.text_encoder = model_manager.fetch_model("wan_video_text_encoder")
395
+ dit = model_manager.fetch_model("wan_video_dit", index=2)
396
+ if isinstance(dit, list):
397
+ pipe.dit, pipe.dit2 = dit
398
+ else:
399
+ pipe.dit = dit
400
+ pipe.vae = model_manager.fetch_model("wan_video_vae")
401
+ pipe.image_encoder = model_manager.fetch_model("wan_video_image_encoder")
402
+ pipe.motion_controller = model_manager.fetch_model(
403
+ "wan_video_motion_controller"
404
+ )
405
+ pipe.vace = model_manager.fetch_model("wan_video_vace")
406
+
407
+ # Size division factor
408
+ if pipe.vae is not None:
409
+ pipe.height_division_factor = pipe.vae.upsampling_factor * 2
410
+ pipe.width_division_factor = pipe.vae.upsampling_factor * 2
411
+
412
+ # Initialize tokenizer
413
+ tokenizer_config.download_if_necessary(use_usp=use_usp)
414
+ pipe.prompter.fetch_models(pipe.text_encoder)
415
+ pipe.prompter.fetch_tokenizer(tokenizer_config.path)
416
+
417
+ # Unified Sequence Parallel
418
+ if use_usp:
419
+ pipe.enable_usp()
420
+ return pipe
421
+
422
+ @torch.no_grad()
423
+ def __call__(
424
+ self,
425
+ # Prompt
426
+ prompt: str,
427
+ negative_prompt: Optional[str] = "",
428
+ # Image-to-video
429
+ input_image: Optional[Image.Image] = None,
430
+ # First-last-frame-to-video
431
+ end_image: Optional[Image.Image] = None,
432
+ # Video-to-video
433
+ input_video: Optional[list[Image.Image]] = None,
434
+ denoising_strength: Optional[float] = 1,
435
+ # ControlNet
436
+ control_video: Optional[list[Image.Image]] = None,
437
+ reference_image: Optional[Image.Image] = None,
438
+ # Camera control
439
+ camera_control_direction: Optional[
440
+ Literal[
441
+ "Left",
442
+ "Right",
443
+ "Up",
444
+ "Down",
445
+ "LeftUp",
446
+ "LeftDown",
447
+ "RightUp",
448
+ "RightDown",
449
+ ]
450
+ ] = None,
451
+ camera_control_speed: Optional[float] = 1 / 54,
452
+ camera_control_origin: Optional[tuple] = (
453
+ 0,
454
+ 0.532139961,
455
+ 0.946026558,
456
+ 0.5,
457
+ 0.5,
458
+ 0,
459
+ 0,
460
+ 1,
461
+ 0,
462
+ 0,
463
+ 0,
464
+ 0,
465
+ 1,
466
+ 0,
467
+ 0,
468
+ 0,
469
+ 0,
470
+ 1,
471
+ 0,
472
+ ),
473
+ # VACE
474
+ vace_video: Optional[list[Image.Image]] = None,
475
+ vace_video_mask: Optional[Image.Image] = None,
476
+ vace_reference_image: Optional[Image.Image] = None,
477
+ vace_scale: Optional[float] = 1.0,
478
+ # Randomness
479
+ seed: Optional[int] = None,
480
+ rand_device: Optional[str] = "cpu",
481
+ # Shape
482
+ height: Optional[int] = 480,
483
+ width: Optional[int] = 832,
484
+ num_frames=81,
485
+ # Classifier-free guidance
486
+ cfg_scale: Optional[float] = 5.0,
487
+ cfg_merge: Optional[bool] = False,
488
+ # Boundary
489
+ switch_DiT_boundary: Optional[float] = 0.875,
490
+ # Scheduler
491
+ num_inference_steps: Optional[int] = 50,
492
+ sigma_shift: Optional[float] = 5.0,
493
+ # Speed control
494
+ motion_bucket_id: Optional[int] = None,
495
+ # VAE tiling
496
+ tiled: Optional[bool] = True,
497
+ tile_size: Optional[tuple[int, int]] = (30, 52),
498
+ tile_stride: Optional[tuple[int, int]] = (15, 26),
499
+ # Sliding window
500
+ sliding_window_size: Optional[int] = None,
501
+ sliding_window_stride: Optional[int] = None,
502
+ # Teacache
503
+ tea_cache_l1_thresh: Optional[float] = None,
504
+ tea_cache_model_id: Optional[str] = "",
505
+ # progress_bar
506
+ progress_bar_cmd=tqdm,
507
+ # Stand-In
508
+ face_mask=None,
509
+ ip_image=None,
510
+ force_background_consistency=False
511
+ ):
512
+ if ip_image is not None:
513
+ ip_image = self.encode_ip_image(ip_image)
514
+ # Scheduler
515
+ self.scheduler.set_timesteps(
516
+ num_inference_steps,
517
+ denoising_strength=denoising_strength,
518
+ shift=sigma_shift,
519
+ )
520
+
521
+ # Inputs
522
+ inputs_posi = {
523
+ "prompt": prompt,
524
+ "tea_cache_l1_thresh": tea_cache_l1_thresh,
525
+ "tea_cache_model_id": tea_cache_model_id,
526
+ "num_inference_steps": num_inference_steps,
527
+ }
528
+ inputs_nega = {
529
+ "negative_prompt": negative_prompt,
530
+ "tea_cache_l1_thresh": tea_cache_l1_thresh,
531
+ "tea_cache_model_id": tea_cache_model_id,
532
+ "num_inference_steps": num_inference_steps,
533
+ }
534
+ inputs_shared = {
535
+ "input_image": input_image,
536
+ "end_image": end_image,
537
+ "input_video": input_video,
538
+ "denoising_strength": denoising_strength,
539
+ "control_video": control_video,
540
+ "reference_image": reference_image,
541
+ "camera_control_direction": camera_control_direction,
542
+ "camera_control_speed": camera_control_speed,
543
+ "camera_control_origin": camera_control_origin,
544
+ "vace_video": vace_video,
545
+ "vace_video_mask": vace_video_mask,
546
+ "vace_reference_image": vace_reference_image,
547
+ "vace_scale": vace_scale,
548
+ "seed": seed,
549
+ "rand_device": rand_device,
550
+ "height": height,
551
+ "width": width,
552
+ "num_frames": num_frames,
553
+ "cfg_scale": cfg_scale,
554
+ "cfg_merge": cfg_merge,
555
+ "sigma_shift": sigma_shift,
556
+ "motion_bucket_id": motion_bucket_id,
557
+ "tiled": tiled,
558
+ "tile_size": tile_size,
559
+ "tile_stride": tile_stride,
560
+ "sliding_window_size": sliding_window_size,
561
+ "sliding_window_stride": sliding_window_stride,
562
+ "ip_image": ip_image,
563
+ }
564
+ for unit in self.units:
565
+ inputs_shared, inputs_posi, inputs_nega = self.unit_runner(
566
+ unit, self, inputs_shared, inputs_posi, inputs_nega
567
+ )
568
+ latent_mask = None
+ if face_mask is not None:
569
+ mask_processed = self.preprocess_video(face_mask)
570
+ mask_processed = mask_processed[:, 0:1, ...]
571
+ latent_mask = torch.nn.functional.interpolate(
572
+ mask_processed,
573
+ size=inputs_shared["latents"].shape[2:],
574
+ mode="nearest-exact",
575
+ )
576
+ # Denoise
577
+ self.load_models_to_device(self.in_iteration_models)
578
+ models = {name: getattr(self, name) for name in self.in_iteration_models}
579
+ for progress_id, timestep in enumerate(
580
+ progress_bar_cmd(self.scheduler.timesteps)
581
+ ):
582
+ # Switch DiT if necessary
583
+ if (
584
+ timestep.item()
585
+ < switch_DiT_boundary * self.scheduler.num_train_timesteps
586
+ and self.dit2 is not None
587
+ and not models["dit"] is self.dit2
588
+ ):
589
+ self.load_models_to_device(self.in_iteration_models_2)
590
+ models["dit"] = self.dit2
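+ # When two DiTs are loaded (e.g. a high-noise and a low-noise expert), switch to dit2 once the timestep falls below switch_DiT_boundary.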
591
+
592
+ # Timestep
593
+ timestep = timestep.unsqueeze(0).to(
594
+ dtype=self.torch_dtype, device=self.device
595
+ )
596
+
597
+ # Inference
598
+ noise_pred_posi = self.model_fn(
599
+ **models, **inputs_shared, **inputs_posi, timestep=timestep
600
+ )
601
+ inputs_shared["ip_image"] = None
602
+ if cfg_scale != 1.0:
603
+ if cfg_merge:
604
+ noise_pred_posi, noise_pred_nega = noise_pred_posi.chunk(2, dim=0)
605
+ else:
606
+ noise_pred_nega = self.model_fn(
607
+ **models, **inputs_shared, **inputs_nega, timestep=timestep
608
+ )
609
+ noise_pred = noise_pred_nega + cfg_scale * (
610
+ noise_pred_posi - noise_pred_nega
611
+ )
612
+ else:
613
+ noise_pred = noise_pred_posi
614
+
615
+ # Scheduler
616
+ inputs_shared["latents"] = self.scheduler.step(
617
+ noise_pred,
618
+ self.scheduler.timesteps[progress_id],
619
+ inputs_shared["latents"],
620
+ )
621
+ if force_background_consistency:
622
+ if (
623
+ inputs_shared.get("input_latents") is not None
624
+ and latent_mask is not None
625
+ ):
626
+ if progress_id == len(self.scheduler.timesteps) - 1:
627
+ noised_original_latents = inputs_shared["input_latents"]
628
+ else:
629
+ next_timestep = self.scheduler.timesteps[progress_id + 1]
630
+ noised_original_latents = self.scheduler.add_noise(
631
+ inputs_shared["input_latents"],
632
+ inputs_shared["noise"],
633
+ timestep=next_timestep,
634
+ )
635
+
636
+ hard_mask = (latent_mask > 0.5).to(
637
+ dtype=inputs_shared["latents"].dtype
638
+ )
639
+
640
+ inputs_shared["latents"] = (
641
+ 1 - hard_mask
642
+ ) * noised_original_latents + hard_mask * inputs_shared["latents"]
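+ # Outside the mask, latents are reset to the original video latents re-noised to the next timestep's level so the background stays consistent; only the masked region keeps the freely denoised result.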
643
+
644
+ if "first_frame_latents" in inputs_shared:
645
+ inputs_shared["latents"][:, :, 0:1] = inputs_shared[
646
+ "first_frame_latents"
647
+ ]
648
+
649
+ if vace_reference_image is not None:
650
+ inputs_shared["latents"] = inputs_shared["latents"][:, :, 1:]
651
+
652
+ # Decode
653
+ self.load_models_to_device(["vae"])
654
+ video = self.vae.decode(
655
+ inputs_shared["latents"],
656
+ device=self.device,
657
+ tiled=tiled,
658
+ tile_size=tile_size,
659
+ tile_stride=tile_stride,
660
+ )
661
+ video = self.vae_output_to_video(video)
662
+ self.load_models_to_device([])
663
+
664
+ return video
665
+
666
+
667
+ class WanVideoUnit_ShapeChecker(PipelineUnit):
668
+ def __init__(self):
669
+ super().__init__(input_params=("height", "width", "num_frames"))
670
+
671
+ def process(self, pipe: WanVideoPipeline_FaceSwap, height, width, num_frames):
672
+ height, width, num_frames = pipe.check_resize_height_width(
673
+ height, width, num_frames
674
+ )
675
+ return {"height": height, "width": width, "num_frames": num_frames}
676
+
677
+
678
+ class WanVideoUnit_NoiseInitializer(PipelineUnit):
679
+ def __init__(self):
680
+ super().__init__(
681
+ input_params=(
682
+ "height",
683
+ "width",
684
+ "num_frames",
685
+ "seed",
686
+ "rand_device",
687
+ "vace_reference_image",
688
+ )
689
+ )
690
+
691
+ def process(
692
+ self,
693
+ pipe: WanVideoPipeline_FaceSwap,
694
+ height,
695
+ width,
696
+ num_frames,
697
+ seed,
698
+ rand_device,
699
+ vace_reference_image,
700
+ ):
701
+ length = (num_frames - 1) // 4 + 1
702
+ if vace_reference_image is not None:
703
+ length += 1
704
+ shape = (
705
+ 1,
706
+ pipe.vae.model.z_dim,
707
+ length,
708
+ height // pipe.vae.upsampling_factor,
709
+ width // pipe.vae.upsampling_factor,
710
+ )
711
+ noise = pipe.generate_noise(shape, seed=seed, rand_device=rand_device)
712
+ if vace_reference_image is not None:
713
+ noise = torch.concat((noise[:, :, -1:], noise[:, :, :-1]), dim=2)
714
+ return {"noise": noise}
715
+
716
+
717
+ class WanVideoUnit_InputVideoEmbedder(PipelineUnit):
718
+ def __init__(self):
719
+ super().__init__(
720
+ input_params=(
721
+ "input_video",
722
+ "noise",
723
+ "tiled",
724
+ "tile_size",
725
+ "tile_stride",
726
+ "vace_reference_image",
727
+ ),
728
+ onload_model_names=("vae",),
729
+ )
730
+
731
+ def process(
732
+ self,
733
+ pipe: WanVideoPipeline_FaceSwap,
734
+ input_video,
735
+ noise,
736
+ tiled,
737
+ tile_size,
738
+ tile_stride,
739
+ vace_reference_image,
740
+ ):
741
+ if input_video is None:
742
+ return {"latents": noise}
743
+ pipe.load_models_to_device(["vae"])
744
+ input_video = pipe.preprocess_video(input_video)
745
+ input_latents = pipe.vae.encode(
746
+ input_video,
747
+ device=pipe.device,
748
+ tiled=tiled,
749
+ tile_size=tile_size,
750
+ tile_stride=tile_stride,
751
+ ).to(dtype=pipe.torch_dtype, device=pipe.device)
752
+ if vace_reference_image is not None:
753
+ vace_reference_image = pipe.preprocess_video([vace_reference_image])
754
+ vace_reference_latents = pipe.vae.encode(
755
+ vace_reference_image, device=pipe.device
756
+ ).to(dtype=pipe.torch_dtype, device=pipe.device)
757
+ input_latents = torch.concat([vace_reference_latents, input_latents], dim=2)
758
+ if pipe.scheduler.training:
759
+ return {"latents": noise, "input_latents": input_latents}
760
+ else:
761
+ latents = pipe.scheduler.add_noise(
762
+ input_latents, noise, timestep=pipe.scheduler.timesteps[0]
763
+ )
764
+ return {"latents": latents, "input_latents": input_latents}
765
+
766
+
767
+ class WanVideoUnit_PromptEmbedder(PipelineUnit):
768
+ def __init__(self):
769
+ super().__init__(
770
+ seperate_cfg=True,
771
+ input_params_posi={"prompt": "prompt", "positive": "positive"},
772
+ input_params_nega={"prompt": "negative_prompt", "positive": "positive"},
773
+ onload_model_names=("text_encoder",),
774
+ )
775
+
776
+ def process(self, pipe: WanVideoPipeline_FaceSwap, prompt, positive) -> dict:
777
+ pipe.load_models_to_device(self.onload_model_names)
778
+ prompt_emb = pipe.prompter.encode_prompt(
779
+ prompt, positive=positive, device=pipe.device
780
+ )
781
+ return {"context": prompt_emb}
782
+
783
+
784
+ class WanVideoUnit_ImageEmbedder(PipelineUnit):
785
+ """
786
+ Deprecated
787
+ """
788
+
789
+ def __init__(self):
790
+ super().__init__(
791
+ input_params=(
792
+ "input_image",
793
+ "end_image",
794
+ "num_frames",
795
+ "height",
796
+ "width",
797
+ "tiled",
798
+ "tile_size",
799
+ "tile_stride",
800
+ ),
801
+ onload_model_names=("image_encoder", "vae"),
802
+ )
803
+
804
+ def process(
805
+ self,
806
+ pipe: WanVideoPipeline_FaceSwap,
807
+ input_image,
808
+ end_image,
809
+ num_frames,
810
+ height,
811
+ width,
812
+ tiled,
813
+ tile_size,
814
+ tile_stride,
815
+ ):
816
+ if input_image is None or pipe.image_encoder is None:
817
+ return {}
818
+ pipe.load_models_to_device(self.onload_model_names)
819
+ image = pipe.preprocess_image(input_image.resize((width, height))).to(
820
+ pipe.device
821
+ )
822
+ clip_context = pipe.image_encoder.encode_image([image])
823
+ msk = torch.ones(1, num_frames, height // 8, width // 8, device=pipe.device)
824
+ msk[:, 1:] = 0
825
+ if end_image is not None:
826
+ end_image = pipe.preprocess_image(end_image.resize((width, height))).to(
827
+ pipe.device
828
+ )
829
+ vae_input = torch.concat(
830
+ [
831
+ image.transpose(0, 1),
832
+ torch.zeros(3, num_frames - 2, height, width).to(image.device),
833
+ end_image.transpose(0, 1),
834
+ ],
835
+ dim=1,
836
+ )
837
+ if pipe.dit.has_image_pos_emb:
838
+ clip_context = torch.concat(
839
+ [clip_context, pipe.image_encoder.encode_image([end_image])], dim=1
840
+ )
841
+ msk[:, -1:] = 1
842
+ else:
843
+ vae_input = torch.concat(
844
+ [
845
+ image.transpose(0, 1),
846
+ torch.zeros(3, num_frames - 1, height, width).to(image.device),
847
+ ],
848
+ dim=1,
849
+ )
850
+
851
+ msk = torch.concat(
852
+ [torch.repeat_interleave(msk[:, 0:1], repeats=4, dim=1), msk[:, 1:]], dim=1
853
+ )
854
+ msk = msk.view(1, msk.shape[1] // 4, 4, height // 8, width // 8)
855
+ msk = msk.transpose(1, 2)[0]
856
+
857
+ y = pipe.vae.encode(
858
+ [vae_input.to(dtype=pipe.torch_dtype, device=pipe.device)],
859
+ device=pipe.device,
860
+ tiled=tiled,
861
+ tile_size=tile_size,
862
+ tile_stride=tile_stride,
863
+ )[0]
864
+ y = y.to(dtype=pipe.torch_dtype, device=pipe.device)
865
+ y = torch.concat([msk, y])
866
+ y = y.unsqueeze(0)
867
+ clip_context = clip_context.to(dtype=pipe.torch_dtype, device=pipe.device)
868
+ y = y.to(dtype=pipe.torch_dtype, device=pipe.device)
869
+ return {"clip_feature": clip_context, "y": y}
870
+
871
+
872
+ class WanVideoUnit_ImageEmbedderCLIP(PipelineUnit):
873
+ def __init__(self):
874
+ super().__init__(
875
+ input_params=("input_image", "end_image", "height", "width"),
876
+ onload_model_names=("image_encoder",),
877
+ )
878
+
879
+ def process(
880
+ self, pipe: WanVideoPipeline_FaceSwap, input_image, end_image, height, width
881
+ ):
882
+ if (
883
+ input_image is None
884
+ or pipe.image_encoder is None
885
+ or not pipe.dit.require_clip_embedding
886
+ ):
887
+ return {}
888
+ pipe.load_models_to_device(self.onload_model_names)
889
+ image = pipe.preprocess_image(input_image.resize((width, height))).to(
890
+ pipe.device
891
+ )
892
+ clip_context = pipe.image_encoder.encode_image([image])
893
+ if end_image is not None:
894
+ end_image = pipe.preprocess_image(end_image.resize((width, height))).to(
895
+ pipe.device
896
+ )
897
+ if pipe.dit.has_image_pos_emb:
898
+ clip_context = torch.concat(
899
+ [clip_context, pipe.image_encoder.encode_image([end_image])], dim=1
900
+ )
901
+ clip_context = clip_context.to(dtype=pipe.torch_dtype, device=pipe.device)
902
+ return {"clip_feature": clip_context}
903
+
904
+
905
+ class WanVideoUnit_ImageEmbedderVAE(PipelineUnit):
906
+ def __init__(self):
907
+ super().__init__(
908
+ input_params=(
909
+ "input_image",
910
+ "end_image",
911
+ "num_frames",
912
+ "height",
913
+ "width",
914
+ "tiled",
915
+ "tile_size",
916
+ "tile_stride",
917
+ ),
918
+ onload_model_names=("vae",),
919
+ )
920
+
921
+ def process(
922
+ self,
923
+ pipe: WanVideoPipeline_FaceSwap,
924
+ input_image,
925
+ end_image,
926
+ num_frames,
927
+ height,
928
+ width,
929
+ tiled,
930
+ tile_size,
931
+ tile_stride,
932
+ ):
933
+ if input_image is None or not pipe.dit.require_vae_embedding:
934
+ return {}
935
+ pipe.load_models_to_device(self.onload_model_names)
936
+ image = pipe.preprocess_image(input_image.resize((width, height))).to(
937
+ pipe.device
938
+ )
939
+ msk = torch.ones(1, num_frames, height // 8, width // 8, device=pipe.device)
940
+ msk[:, 1:] = 0
941
+ if end_image is not None:
942
+ end_image = pipe.preprocess_image(end_image.resize((width, height))).to(
943
+ pipe.device
944
+ )
945
+ vae_input = torch.concat(
946
+ [
947
+ image.transpose(0, 1),
948
+ torch.zeros(3, num_frames - 2, height, width).to(image.device),
949
+ end_image.transpose(0, 1),
950
+ ],
951
+ dim=1,
952
+ )
953
+ msk[:, -1:] = 1
954
+ else:
955
+ vae_input = torch.concat(
956
+ [
957
+ image.transpose(0, 1),
958
+ torch.zeros(3, num_frames - 1, height, width).to(image.device),
959
+ ],
960
+ dim=1,
961
+ )
962
+
963
+ msk = torch.concat(
964
+ [torch.repeat_interleave(msk[:, 0:1], repeats=4, dim=1), msk[:, 1:]], dim=1
965
+ )
966
+ msk = msk.view(1, msk.shape[1] // 4, 4, height // 8, width // 8)
967
+ msk = msk.transpose(1, 2)[0]
968
+
969
+ y = pipe.vae.encode(
970
+ [vae_input.to(dtype=pipe.torch_dtype, device=pipe.device)],
971
+ device=pipe.device,
972
+ tiled=tiled,
973
+ tile_size=tile_size,
974
+ tile_stride=tile_stride,
975
+ )[0]
976
+ y = y.to(dtype=pipe.torch_dtype, device=pipe.device)
977
+ y = torch.concat([msk, y])
978
+ y = y.unsqueeze(0)
979
+ y = y.to(dtype=pipe.torch_dtype, device=pipe.device)
980
+ return {"y": y}
981
+
982
+
983
+ class WanVideoUnit_ImageEmbedderFused(PipelineUnit):
984
+ """
985
+ Encode input image to latents using VAE. This unit is for Wan-AI/Wan2.2-TI2V-5B.
986
+ """
987
+
988
+ def __init__(self):
989
+ super().__init__(
990
+ input_params=(
991
+ "input_image",
992
+ "latents",
993
+ "height",
994
+ "width",
995
+ "tiled",
996
+ "tile_size",
997
+ "tile_stride",
998
+ ),
999
+ onload_model_names=("vae",),
1000
+ )
1001
+
1002
+ def process(
1003
+ self,
1004
+ pipe: WanVideoPipeline_FaceSwap,
1005
+ input_image,
1006
+ latents,
1007
+ height,
1008
+ width,
1009
+ tiled,
1010
+ tile_size,
1011
+ tile_stride,
1012
+ ):
1013
+ if input_image is None or not pipe.dit.fuse_vae_embedding_in_latents:
1014
+ return {}
1015
+ pipe.load_models_to_device(self.onload_model_names)
1016
+ image = pipe.preprocess_image(input_image.resize((width, height))).transpose(
1017
+ 0, 1
1018
+ )
1019
+ z = pipe.vae.encode(
1020
+ [image],
1021
+ device=pipe.device,
1022
+ tiled=tiled,
1023
+ tile_size=tile_size,
1024
+ tile_stride=tile_stride,
1025
+ )
1026
+ latents[:, :, 0:1] = z
1027
+ return {
1028
+ "latents": latents,
1029
+ "fuse_vae_embedding_in_latents": True,
1030
+ "first_frame_latents": z,
1031
+ }
1032
+
1033
+
1034
+ class WanVideoUnit_FunControl(PipelineUnit):
1035
+ def __init__(self):
1036
+ super().__init__(
1037
+ input_params=(
1038
+ "control_video",
1039
+ "num_frames",
1040
+ "height",
1041
+ "width",
1042
+ "tiled",
1043
+ "tile_size",
1044
+ "tile_stride",
1045
+ "clip_feature",
1046
+ "y",
1047
+ ),
1048
+ onload_model_names=("vae",),
1049
+ )
1050
+
1051
+ def process(
1052
+ self,
1053
+ pipe: WanVideoPipeline_FaceSwap,
1054
+ control_video,
1055
+ num_frames,
1056
+ height,
1057
+ width,
1058
+ tiled,
1059
+ tile_size,
1060
+ tile_stride,
1061
+ clip_feature,
1062
+ y,
1063
+ ):
1064
+ if control_video is None:
1065
+ return {}
1066
+ pipe.load_models_to_device(self.onload_model_names)
1067
+ control_video = pipe.preprocess_video(control_video)
1068
+ control_latents = pipe.vae.encode(
1069
+ control_video,
1070
+ device=pipe.device,
1071
+ tiled=tiled,
1072
+ tile_size=tile_size,
1073
+ tile_stride=tile_stride,
1074
+ ).to(dtype=pipe.torch_dtype, device=pipe.device)
1075
+ control_latents = control_latents.to(dtype=pipe.torch_dtype, device=pipe.device)
1076
+ if clip_feature is None or y is None:
1077
+ clip_feature = torch.zeros(
1078
+ (1, 257, 1280), dtype=pipe.torch_dtype, device=pipe.device
1079
+ )
1080
+ y = torch.zeros(
1081
+ (1, 16, (num_frames - 1) // 4 + 1, height // 8, width // 8),
1082
+ dtype=pipe.torch_dtype,
1083
+ device=pipe.device,
1084
+ )
1085
+ else:
1086
+ y = y[:, -16:]
1087
+ y = torch.concat([control_latents, y], dim=1)
1088
+ return {"clip_feature": clip_feature, "y": y}
1089
+
1090
+
1091
+ class WanVideoUnit_FunReference(PipelineUnit):
1092
+ def __init__(self):
1093
+ super().__init__(
1094
+ input_params=("reference_image", "height", "width", "reference_image"),
1095
+ onload_model_names=("vae",),
1096
+ )
1097
+
1098
+ def process(self, pipe: WanVideoPipeline_FaceSwap, reference_image, height, width):
1099
+ if reference_image is None:
1100
+ return {}
1101
+ pipe.load_models_to_device(["vae"])
1102
+ reference_image = reference_image.resize((width, height))
1103
+ reference_latents = pipe.preprocess_video([reference_image])
1104
+ reference_latents = pipe.vae.encode(reference_latents, device=pipe.device)
1105
+ clip_feature = pipe.preprocess_image(reference_image)
1106
+ clip_feature = pipe.image_encoder.encode_image([clip_feature])
1107
+ return {"reference_latents": reference_latents, "clip_feature": clip_feature}
1108
+
1109
+
1110
+ class WanVideoUnit_FunCameraControl(PipelineUnit):
1111
+ def __init__(self):
1112
+ super().__init__(
1113
+ input_params=(
1114
+ "height",
1115
+ "width",
1116
+ "num_frames",
1117
+ "camera_control_direction",
1118
+ "camera_control_speed",
1119
+ "camera_control_origin",
1120
+ "latents",
1121
+ "input_image",
1122
+ ),
1123
+ onload_model_names=("vae",),
1124
+ )
1125
+
1126
+ def process(
1127
+ self,
1128
+ pipe: WanVideoPipeline_FaceSwap,
1129
+ height,
1130
+ width,
1131
+ num_frames,
1132
+ camera_control_direction,
1133
+ camera_control_speed,
1134
+ camera_control_origin,
1135
+ latents,
1136
+ input_image,
1137
+ ):
1138
+ if camera_control_direction is None:
1139
+ return {}
1140
+ camera_control_plucker_embedding = (
1141
+ pipe.dit.control_adapter.process_camera_coordinates(
1142
+ camera_control_direction,
1143
+ num_frames,
1144
+ height,
1145
+ width,
1146
+ camera_control_speed,
1147
+ camera_control_origin,
1148
+ )
1149
+ )
1150
+
1151
+ control_camera_video = (
1152
+ camera_control_plucker_embedding[:num_frames]
1153
+ .permute([3, 0, 1, 2])
1154
+ .unsqueeze(0)
1155
+ )
1156
+ control_camera_latents = torch.concat(
1157
+ [
1158
+ torch.repeat_interleave(
1159
+ control_camera_video[:, :, 0:1], repeats=4, dim=2
1160
+ ),
1161
+ control_camera_video[:, :, 1:],
1162
+ ],
1163
+ dim=2,
1164
+ ).transpose(1, 2)
1165
+ b, f, c, h, w = control_camera_latents.shape
1166
+ control_camera_latents = (
1167
+ control_camera_latents.contiguous()
1168
+ .view(b, f // 4, 4, c, h, w)
1169
+ .transpose(2, 3)
1170
+ )
1171
+ control_camera_latents = (
1172
+ control_camera_latents.contiguous()
1173
+ .view(b, f // 4, c * 4, h, w)
1174
+ .transpose(1, 2)
1175
+ )
1176
+ control_camera_latents_input = control_camera_latents.to(
1177
+ device=pipe.device, dtype=pipe.torch_dtype
1178
+ )
1179
+
1180
+ input_image = input_image.resize((width, height))
1181
+ input_latents = pipe.preprocess_video([input_image])
1182
+ pipe.load_models_to_device(self.onload_model_names)
1183
+ input_latents = pipe.vae.encode(input_latents, device=pipe.device)
1184
+ y = torch.zeros_like(latents).to(pipe.device)
1185
+ y[:, :, :1] = input_latents
1186
+ y = y.to(dtype=pipe.torch_dtype, device=pipe.device)
1187
+ return {"control_camera_latents_input": control_camera_latents_input, "y": y}
1188
+
1189
+
1190
+ class WanVideoUnit_SpeedControl(PipelineUnit):
1191
+ def __init__(self):
1192
+ super().__init__(input_params=("motion_bucket_id",))
1193
+
1194
+ def process(self, pipe: WanVideoPipeline_FaceSwap, motion_bucket_id):
1195
+ if motion_bucket_id is None:
1196
+ return {}
1197
+ motion_bucket_id = torch.Tensor((motion_bucket_id,)).to(
1198
+ dtype=pipe.torch_dtype, device=pipe.device
1199
+ )
1200
+ return {"motion_bucket_id": motion_bucket_id}
1201
+
1202
+
1203
+ class WanVideoUnit_VACE(PipelineUnit):
1204
+ def __init__(self):
1205
+ super().__init__(
1206
+ input_params=(
1207
+ "vace_video",
1208
+ "vace_video_mask",
1209
+ "vace_reference_image",
1210
+ "vace_scale",
1211
+ "height",
1212
+ "width",
1213
+ "num_frames",
1214
+ "tiled",
1215
+ "tile_size",
1216
+ "tile_stride",
1217
+ ),
1218
+ onload_model_names=("vae",),
1219
+ )
1220
+
1221
+ def process(
1222
+ self,
1223
+ pipe: WanVideoPipeline_FaceSwap,
1224
+ vace_video,
1225
+ vace_video_mask,
1226
+ vace_reference_image,
1227
+ vace_scale,
1228
+ height,
1229
+ width,
1230
+ num_frames,
1231
+ tiled,
1232
+ tile_size,
1233
+ tile_stride,
1234
+ ):
1235
+ if (
1236
+ vace_video is not None
1237
+ or vace_video_mask is not None
1238
+ or vace_reference_image is not None
1239
+ ):
1240
+ pipe.load_models_to_device(["vae"])
1241
+ if vace_video is None:
1242
+ vace_video = torch.zeros(
1243
+ (1, 3, num_frames, height, width),
1244
+ dtype=pipe.torch_dtype,
1245
+ device=pipe.device,
1246
+ )
1247
+ else:
1248
+ vace_video = pipe.preprocess_video(vace_video)
1249
+
1250
+ if vace_video_mask is None:
1251
+ vace_video_mask = torch.ones_like(vace_video)
1252
+ else:
1253
+ vace_video_mask = pipe.preprocess_video(
1254
+ vace_video_mask, min_value=0, max_value=1
1255
+ )
1256
+
1257
+ inactive = vace_video * (1 - vace_video_mask) + 0 * vace_video_mask
1258
+ reactive = vace_video * vace_video_mask + 0 * (1 - vace_video_mask)
1259
+ inactive = pipe.vae.encode(
1260
+ inactive,
1261
+ device=pipe.device,
1262
+ tiled=tiled,
1263
+ tile_size=tile_size,
1264
+ tile_stride=tile_stride,
1265
+ ).to(dtype=pipe.torch_dtype, device=pipe.device)
1266
+ reactive = pipe.vae.encode(
1267
+ reactive,
1268
+ device=pipe.device,
1269
+ tiled=tiled,
1270
+ tile_size=tile_size,
1271
+ tile_stride=tile_stride,
1272
+ ).to(dtype=pipe.torch_dtype, device=pipe.device)
1273
+ vace_video_latents = torch.concat((inactive, reactive), dim=1)
1274
+
1275
+ vace_mask_latents = rearrange(
1276
+ vace_video_mask[0, 0], "T (H P) (W Q) -> 1 (P Q) T H W", P=8, Q=8
1277
+ )
1278
+ vace_mask_latents = torch.nn.functional.interpolate(
1279
+ vace_mask_latents,
1280
+ size=(
1281
+ (vace_mask_latents.shape[2] + 3) // 4,
1282
+ vace_mask_latents.shape[3],
1283
+ vace_mask_latents.shape[4],
1284
+ ),
1285
+ mode="nearest-exact",
1286
+ )
1287
+
1288
+ if vace_reference_image is None:
1289
+ pass
1290
+ else:
1291
+ vace_reference_image = pipe.preprocess_video([vace_reference_image])
1292
+ vace_reference_latents = pipe.vae.encode(
1293
+ vace_reference_image,
1294
+ device=pipe.device,
1295
+ tiled=tiled,
1296
+ tile_size=tile_size,
1297
+ tile_stride=tile_stride,
1298
+ ).to(dtype=pipe.torch_dtype, device=pipe.device)
1299
+ vace_reference_latents = torch.concat(
1300
+ (vace_reference_latents, torch.zeros_like(vace_reference_latents)),
1301
+ dim=1,
1302
+ )
1303
+ vace_video_latents = torch.concat(
1304
+ (vace_reference_latents, vace_video_latents), dim=2
1305
+ )
1306
+ vace_mask_latents = torch.concat(
1307
+ (torch.zeros_like(vace_mask_latents[:, :, :1]), vace_mask_latents),
1308
+ dim=2,
1309
+ )
1310
+
1311
+ vace_context = torch.concat((vace_video_latents, vace_mask_latents), dim=1)
1312
+ return {"vace_context": vace_context, "vace_scale": vace_scale}
1313
+ else:
1314
+ return {"vace_context": None, "vace_scale": vace_scale}
1315
+
1316
+
1317
+ class WanVideoUnit_UnifiedSequenceParallel(PipelineUnit):
1318
+ def __init__(self):
1319
+ super().__init__(input_params=())
1320
+
1321
+ def process(self, pipe: WanVideoPipeline_FaceSwap):
1322
+ if hasattr(pipe, "use_unified_sequence_parallel"):
1323
+ if pipe.use_unified_sequence_parallel:
1324
+ return {"use_unified_sequence_parallel": True}
1325
+ return {}
1326
+
1327
+
1328
+ class WanVideoUnit_TeaCache(PipelineUnit):
1329
+ def __init__(self):
1330
+ super().__init__(
1331
+ seperate_cfg=True,
1332
+ input_params_posi={
1333
+ "num_inference_steps": "num_inference_steps",
1334
+ "tea_cache_l1_thresh": "tea_cache_l1_thresh",
1335
+ "tea_cache_model_id": "tea_cache_model_id",
1336
+ },
1337
+ input_params_nega={
1338
+ "num_inference_steps": "num_inference_steps",
1339
+ "tea_cache_l1_thresh": "tea_cache_l1_thresh",
1340
+ "tea_cache_model_id": "tea_cache_model_id",
1341
+ },
1342
+ )
1343
+
1344
+ def process(
1345
+ self,
1346
+ pipe: WanVideoPipeline_FaceSwap,
1347
+ num_inference_steps,
1348
+ tea_cache_l1_thresh,
1349
+ tea_cache_model_id,
1350
+ ):
1351
+ if tea_cache_l1_thresh is None:
1352
+ return {}
1353
+ return {
1354
+ "tea_cache": TeaCache(
1355
+ num_inference_steps,
1356
+ rel_l1_thresh=tea_cache_l1_thresh,
1357
+ model_id=tea_cache_model_id,
1358
+ )
1359
+ }
1360
+
1361
+
1362
+ class WanVideoUnit_CfgMerger(PipelineUnit):
1363
+ def __init__(self):
1364
+ super().__init__(take_over=True)
1365
+ self.concat_tensor_names = ["context", "clip_feature", "y", "reference_latents"]
1366
+
1367
+ def process(
1368
+ self, pipe: WanVideoPipeline_FaceSwap, inputs_shared, inputs_posi, inputs_nega
1369
+ ):
1370
+ if not inputs_shared["cfg_merge"]:
1371
+ return inputs_shared, inputs_posi, inputs_nega
1372
+ for name in self.concat_tensor_names:
1373
+ tensor_posi = inputs_posi.get(name)
1374
+ tensor_nega = inputs_nega.get(name)
1375
+ tensor_shared = inputs_shared.get(name)
1376
+ if tensor_posi is not None and tensor_nega is not None:
1377
+ inputs_shared[name] = torch.concat((tensor_posi, tensor_nega), dim=0)
1378
+ elif tensor_shared is not None:
1379
+ inputs_shared[name] = torch.concat(
1380
+ (tensor_shared, tensor_shared), dim=0
1381
+ )
1382
+ inputs_posi.clear()
1383
+ inputs_nega.clear()
1384
+ return inputs_shared, inputs_posi, inputs_nega
1385
+
1386
+
1387
+ class TeaCache:
1388
+ def __init__(self, num_inference_steps, rel_l1_thresh, model_id):
1389
+ self.num_inference_steps = num_inference_steps
1390
+ self.step = 0
1391
+ self.accumulated_rel_l1_distance = 0
1392
+ self.previous_modulated_input = None
1393
+ self.rel_l1_thresh = rel_l1_thresh
1394
+ self.previous_residual = None
1395
+ self.previous_hidden_states = None
1396
+
1397
+ self.coefficients_dict = {
1398
+ "Wan2.1-T2V-1.3B": [
1399
+ -5.21862437e04,
1400
+ 9.23041404e03,
1401
+ -5.28275948e02,
1402
+ 1.36987616e01,
1403
+ -4.99875664e-02,
1404
+ ],
1405
+ "Wan2.1-T2V-14B": [
1406
+ -3.03318725e05,
1407
+ 4.90537029e04,
1408
+ -2.65530556e03,
1409
+ 5.87365115e01,
1410
+ -3.15583525e-01,
1411
+ ],
1412
+ "Wan2.1-I2V-14B-480P": [
1413
+ 2.57151496e05,
1414
+ -3.54229917e04,
1415
+ 1.40286849e03,
1416
+ -1.35890334e01,
1417
+ 1.32517977e-01,
1418
+ ],
1419
+ "Wan2.1-I2V-14B-720P": [
1420
+ 8.10705460e03,
1421
+ 2.13393892e03,
1422
+ -3.72934672e02,
1423
+ 1.66203073e01,
1424
+ -4.17769401e-02,
1425
+ ],
1426
+ }
1427
+ if model_id not in self.coefficients_dict:
1428
+ supported_model_ids = ", ".join([i for i in self.coefficients_dict])
1429
+ raise ValueError(
1430
+ f"{model_id} is not a supported TeaCache model id. Please choose a valid model id in ({supported_model_ids})."
1431
+ )
1432
+ self.coefficients = self.coefficients_dict[model_id]
1433
+
1434
+ def check(self, dit: WanModel, x, t_mod):
1435
+ modulated_inp = t_mod.clone()
1436
+ if self.step == 0 or self.step == self.num_inference_steps - 1:
1437
+ should_calc = True
1438
+ self.accumulated_rel_l1_distance = 0
1439
+ else:
1440
+ coefficients = self.coefficients
1441
+ rescale_func = np.poly1d(coefficients)
1442
+ self.accumulated_rel_l1_distance += rescale_func(
1443
+ (
1444
+ (modulated_inp - self.previous_modulated_input).abs().mean()
1445
+ / self.previous_modulated_input.abs().mean()
1446
+ )
1447
+ .cpu()
1448
+ .item()
1449
+ )
1450
+ if self.accumulated_rel_l1_distance < self.rel_l1_thresh:
1451
+ should_calc = False
1452
+ else:
1453
+ should_calc = True
1454
+ self.accumulated_rel_l1_distance = 0
1455
+ self.previous_modulated_input = modulated_inp
1456
+ self.step += 1
1457
+ if self.step == self.num_inference_steps:
1458
+ self.step = 0
1459
+ if should_calc:
1460
+ self.previous_hidden_states = x.clone()
1461
+ return not should_calc
1462
+
1463
+ def store(self, hidden_states):
1464
+ self.previous_residual = hidden_states - self.previous_hidden_states
1465
+ self.previous_hidden_states = None
1466
+
1467
+ def update(self, hidden_states):
1468
+ hidden_states = hidden_states + self.previous_residual
1469
+ return hidden_states
1470
+
1471
+
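A standalone sketch (not part of this file) of the caching rule that TeaCache.check implements above: a fitted polynomial maps the relative L1 change of the modulated input to an estimated output change, and the transformer blocks are skipped while the accumulated estimate stays under the threshold. The rel_change value below is made up for illustration; the coefficients are the Wan2.1-T2V-1.3B entry from the table above.

import numpy as np

coefficients = [-5.21862437e04, 9.23041404e03, -5.28275948e02, 1.36987616e01, -4.99875664e-02]
rescale = np.poly1d(coefficients)          # maps relative input change -> estimated output change

rel_l1_thresh = 0.05                       # plays the role of tea_cache_l1_thresh
accumulated = 0.0
rel_change = 1e-3                          # pretend |t_mod_now - t_mod_prev|.mean() / |t_mod_prev|.mean()

accumulated += rescale(rel_change)
should_skip = accumulated < rel_l1_thresh  # True -> reuse previous_residual instead of running the blocks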
1472
+ class TemporalTiler_BCTHW:
1473
+ def __init__(self):
1474
+ pass
1475
+
1476
+ def build_1d_mask(self, length, left_bound, right_bound, border_width):
1477
+ x = torch.ones((length,))
1478
+ if not left_bound:
1479
+ x[:border_width] = (torch.arange(border_width) + 1) / border_width
1480
+ if not right_bound:
1481
+ x[-border_width:] = torch.flip(
1482
+ (torch.arange(border_width) + 1) / border_width, dims=(0,)
1483
+ )
1484
+ return x
1485
+
1486
+ def build_mask(self, data, is_bound, border_width):
1487
+ _, _, T, _, _ = data.shape
1488
+ t = self.build_1d_mask(T, is_bound[0], is_bound[1], border_width[0])
1489
+ mask = repeat(t, "T -> 1 1 T 1 1")
1490
+ return mask
1491
+
1492
+ def run(
1493
+ self,
1494
+ model_fn,
1495
+ sliding_window_size,
1496
+ sliding_window_stride,
1497
+ computation_device,
1498
+ computation_dtype,
1499
+ model_kwargs,
1500
+ tensor_names,
1501
+ batch_size=None,
1502
+ ):
1503
+ tensor_names = [
1504
+ tensor_name
1505
+ for tensor_name in tensor_names
1506
+ if model_kwargs.get(tensor_name) is not None
1507
+ ]
1508
+ tensor_dict = {
1509
+ tensor_name: model_kwargs[tensor_name] for tensor_name in tensor_names
1510
+ }
1511
+ B, C, T, H, W = tensor_dict[tensor_names[0]].shape
1512
+ if batch_size is not None:
1513
+ B *= batch_size
1514
+ data_device, data_dtype = (
1515
+ tensor_dict[tensor_names[0]].device,
1516
+ tensor_dict[tensor_names[0]].dtype,
1517
+ )
1518
+ value = torch.zeros((B, C, T, H, W), device=data_device, dtype=data_dtype)
1519
+ weight = torch.zeros((1, 1, T, 1, 1), device=data_device, dtype=data_dtype)
1520
+ for t in range(0, T, sliding_window_stride):
1521
+ if (
1522
+ t - sliding_window_stride >= 0
1523
+ and t - sliding_window_stride + sliding_window_size >= T
1524
+ ):
1525
+ continue
1526
+ t_ = min(t + sliding_window_size, T)
1527
+ model_kwargs.update(
1528
+ {
1529
+ tensor_name: tensor_dict[tensor_name][:, :, t:t_, :, :].to(
1530
+ device=computation_device, dtype=computation_dtype
1531
+ )
1532
+ for tensor_name in tensor_names
1533
+ }
1534
+ )
1535
+ model_output = model_fn(**model_kwargs).to(
1536
+ device=data_device, dtype=data_dtype
1537
+ )
1538
+ mask = self.build_mask(
1539
+ model_output,
1540
+ is_bound=(t == 0, t_ == T),
1541
+ border_width=(sliding_window_size - sliding_window_stride,),
1542
+ ).to(device=data_device, dtype=data_dtype)
1543
+ value[:, :, t:t_, :, :] += model_output * mask
1544
+ weight[:, :, t:t_, :, :] += mask
1545
+ value /= weight
1546
+ model_kwargs.update(tensor_dict)
1547
+ return value
1548
+
1549
+
1550
+ def model_fn_wan_video(
1551
+ dit: WanModel,
1552
+ motion_controller: WanMotionControllerModel = None,
1553
+ vace: VaceWanModel = None,
1554
+ latents: torch.Tensor = None,
1555
+ timestep: torch.Tensor = None,
1556
+ context: torch.Tensor = None,
1557
+ clip_feature: Optional[torch.Tensor] = None,
1558
+ y: Optional[torch.Tensor] = None,
1559
+ reference_latents=None,
1560
+ vace_context=None,
1561
+ vace_scale=1.0,
1562
+ tea_cache: TeaCache = None,
1563
+ use_unified_sequence_parallel: bool = False,
1564
+ motion_bucket_id: Optional[torch.Tensor] = None,
1565
+ sliding_window_size: Optional[int] = None,
1566
+ sliding_window_stride: Optional[int] = None,
1567
+ cfg_merge: bool = False,
1568
+ use_gradient_checkpointing: bool = False,
1569
+ use_gradient_checkpointing_offload: bool = False,
1570
+ control_camera_latents_input=None,
1571
+ fuse_vae_embedding_in_latents: bool = False,
1572
+ ip_image=None,
1573
+ **kwargs,
1574
+ ):
1575
+ if sliding_window_size is not None and sliding_window_stride is not None:
1576
+ model_kwargs = dict(
1577
+ dit=dit,
1578
+ motion_controller=motion_controller,
1579
+ vace=vace,
1580
+ latents=latents,
1581
+ timestep=timestep,
1582
+ context=context,
1583
+ clip_feature=clip_feature,
1584
+ y=y,
1585
+ reference_latents=reference_latents,
1586
+ vace_context=vace_context,
1587
+ vace_scale=vace_scale,
1588
+ tea_cache=tea_cache,
1589
+ use_unified_sequence_parallel=use_unified_sequence_parallel,
1590
+ motion_bucket_id=motion_bucket_id,
1591
+ )
1592
+ return TemporalTiler_BCTHW().run(
1593
+ model_fn_wan_video,
1594
+ sliding_window_size,
1595
+ sliding_window_stride,
1596
+ latents.device,
1597
+ latents.dtype,
1598
+ model_kwargs=model_kwargs,
1599
+ tensor_names=["latents", "y"],
1600
+ batch_size=2 if cfg_merge else 1,
1601
+ )
1602
+
1603
+ if use_unified_sequence_parallel:
1604
+ import torch.distributed as dist
1605
+ from xfuser.core.distributed import (
1606
+ get_sequence_parallel_rank,
1607
+ get_sequence_parallel_world_size,
1608
+ get_sp_group,
1609
+ )
1610
+ x_ip = None
1611
+ t_mod_ip = None
1612
+ # Timestep
1613
+ if dit.seperated_timestep and fuse_vae_embedding_in_latents:
1614
+ timestep = torch.concat(
1615
+ [
1616
+ torch.zeros(
1617
+ (1, latents.shape[3] * latents.shape[4] // 4),
1618
+ dtype=latents.dtype,
1619
+ device=latents.device,
1620
+ ),
1621
+ torch.ones(
1622
+ (latents.shape[2] - 1, latents.shape[3] * latents.shape[4] // 4),
1623
+ dtype=latents.dtype,
1624
+ device=latents.device,
1625
+ )
1626
+ * timestep,
1627
+ ]
1628
+ ).flatten()
1629
+ t = dit.time_embedding(
1630
+ sinusoidal_embedding_1d(dit.freq_dim, timestep).unsqueeze(0)
1631
+ )
1632
+ t_mod = dit.time_projection(t).unflatten(2, (6, dit.dim))
1633
+ else:
1634
+ t = dit.time_embedding(sinusoidal_embedding_1d(dit.freq_dim, timestep))
1635
+ t_mod = dit.time_projection(t).unflatten(1, (6, dit.dim))
1636
+
1637
+ if ip_image is not None:
1638
+ timestep_ip = torch.zeros_like(timestep) # [B] with 0s
1639
+ t_ip = dit.time_embedding(sinusoidal_embedding_1d(dit.freq_dim, timestep_ip))
1640
+ t_mod_ip = dit.time_projection(t_ip).unflatten(1, (6, dit.dim))
1641
+
1642
+ # Motion Controller
1643
+ if motion_bucket_id is not None and motion_controller is not None:
1644
+ t_mod = t_mod + motion_controller(motion_bucket_id).unflatten(1, (6, dit.dim))
1645
+ context = dit.text_embedding(context)
1646
+
1647
+ x = latents
1648
+ # Merged cfg
1649
+ if x.shape[0] != context.shape[0]:
1650
+ x = torch.concat([x] * context.shape[0], dim=0)
1651
+ if timestep.shape[0] != context.shape[0]:
1652
+ timestep = torch.concat([timestep] * context.shape[0], dim=0)
1653
+
1654
+ # Image Embedding
1655
+ if y is not None and dit.require_vae_embedding:
1656
+ x = torch.cat([x, y], dim=1)
1657
+ if clip_feature is not None and dit.require_clip_embedding:
1658
+ clip_embedding = dit.img_emb(clip_feature)
1659
+ context = torch.cat([clip_embedding, context], dim=1)
1660
+
1661
+ # Add camera control
1662
+ x, (f, h, w) = dit.patchify(x, control_camera_latents_input)
1663
+
1664
+ # Reference image
1665
+ if reference_latents is not None:
1666
+ if len(reference_latents.shape) == 5:
1667
+ reference_latents = reference_latents[:, :, 0]
1668
+ reference_latents = dit.ref_conv(reference_latents).flatten(2).transpose(1, 2)
1669
+ x = torch.concat([reference_latents, x], dim=1)
1670
+ f += 1
1671
+
1672
+ offset = 1
1673
+ freqs = (
1674
+ torch.cat(
1675
+ [
1676
+ dit.freqs[0][offset : f + offset].view(f, 1, 1, -1).expand(f, h, w, -1),
1677
+ dit.freqs[1][offset : h + offset].view(1, h, 1, -1).expand(f, h, w, -1),
1678
+ dit.freqs[2][offset : w + offset].view(1, 1, w, -1).expand(f, h, w, -1),
1679
+ ],
1680
+ dim=-1,
1681
+ )
1682
+ .reshape(f * h * w, 1, -1)
1683
+ .to(x.device)
1684
+ )
1685
+
1686
+ ############################################################################################
1687
+ if ip_image is not None:
1688
+ x_ip, (f_ip, h_ip, w_ip) = dit.patchify(
1689
+ ip_image
1690
+ ) # x_ip [1, 1024, 5120] [B, N, D] f_ip = 1 h_ip = 32 w_ip = 32
1691
+ freqs_ip = (
1692
+ torch.cat(
1693
+ [
1694
+ dit.freqs[0][0].view(f_ip, 1, 1, -1).expand(f_ip, h_ip, w_ip, -1),
1695
+ dit.freqs[1][h + offset : h + offset + h_ip]
1696
+ .view(1, h_ip, 1, -1)
1697
+ .expand(f_ip, h_ip, w_ip, -1),
1698
+ dit.freqs[2][w + offset : w + offset + w_ip]
1699
+ .view(1, 1, w_ip, -1)
1700
+ .expand(f_ip, h_ip, w_ip, -1),
1701
+ ],
1702
+ dim=-1,
1703
+ )
1704
+ .reshape(f_ip * h_ip * w_ip, 1, -1)
1705
+ .to(x_ip.device)
1706
+ )
1707
+ freqs_original = freqs
1708
+ freqs = torch.cat([freqs, freqs_ip], dim=0)
1709
+ ############################################################################################
1710
+ else:
1711
+ freqs_original = freqs
1712
+ # TeaCache
1713
+ if tea_cache is not None:
1714
+ tea_cache_update = tea_cache.check(dit, x, t_mod)
1715
+ else:
1716
+ tea_cache_update = False
1717
+
1718
+ if vace_context is not None:
1719
+ vace_hints = vace(x, vace_context, context, t_mod, freqs)
1720
+
1721
+ # blocks
1722
+ if use_unified_sequence_parallel:
1723
+ if dist.is_initialized() and dist.get_world_size() > 1:
1724
+ x = torch.chunk(x, get_sequence_parallel_world_size(), dim=1)[
1725
+ get_sequence_parallel_rank()
1726
+ ]
1727
+ if tea_cache_update:
1728
+ x = tea_cache.update(x)
1729
+ else:
1730
+
1731
+ def create_custom_forward(module):
1732
+ def custom_forward(*inputs):
1733
+ return module(*inputs)
1734
+
1735
+ return custom_forward
1736
+
1737
+ for block_id, block in enumerate(dit.blocks):
1738
+ if use_gradient_checkpointing_offload:
1739
+ with torch.autograd.graph.save_on_cpu():
1740
+ x, x_ip = torch.utils.checkpoint.checkpoint(
1741
+ create_custom_forward(block),
1742
+ x,
1743
+ context,
1744
+ t_mod,
1745
+ freqs,
1746
+ x_ip=x_ip,
1747
+ t_mod_ip=t_mod_ip,
1748
+ use_reentrant=False,
1749
+ )
1750
+ elif use_gradient_checkpointing:
1751
+ x, x_ip = torch.utils.checkpoint.checkpoint(
1752
+ create_custom_forward(block),
1753
+ x,
1754
+ context,
1755
+ t_mod,
1756
+ freqs,
1757
+ x_ip=x_ip,
1758
+ t_mod_ip=t_mod_ip,
1759
+ use_reentrant=False,
1760
+ )
1761
+ else:
1762
+ x, x_ip = block(x, context, t_mod, freqs, x_ip=x_ip, t_mod_ip=t_mod_ip)
1763
+ if vace_context is not None and block_id in vace.vace_layers_mapping:
1764
+ current_vace_hint = vace_hints[vace.vace_layers_mapping[block_id]]
1765
+ if (
1766
+ use_unified_sequence_parallel
1767
+ and dist.is_initialized()
1768
+ and dist.get_world_size() > 1
1769
+ ):
1770
+ current_vace_hint = torch.chunk(
1771
+ current_vace_hint, get_sequence_parallel_world_size(), dim=1
1772
+ )[get_sequence_parallel_rank()]
1773
+ x = x + current_vace_hint * vace_scale
1774
+ if tea_cache is not None:
1775
+ tea_cache.store(x)
1776
+
1777
+ x = dit.head(x, t)
1778
+ if use_unified_sequence_parallel:
1779
+ if dist.is_initialized() and dist.get_world_size() > 1:
1780
+ x = get_sp_group().all_gather(x, dim=1)
1781
+ # Remove reference latents
1782
+ if reference_latents is not None:
1783
+ x = x[:, reference_latents.shape[1] :]
1784
+ f -= 1
1785
+ x = dit.unpatchify(x, (f, h, w))
1786
+ return x
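The TemporalTiler_BCTHW class above splits the temporal axis into overlapping windows, weights each window with linear ramps at its inner borders, and renormalizes by the accumulated weights. Below is a self-contained 1-D sketch of that blending rule; the window and stride values are illustrative, and the per-window "model output" is just a constant stand-in.

import torch

def ramp_mask(length, left_bound, right_bound, border):
    # Ones everywhere, with linear ramps at the borders that touch a neighbouring window.
    m = torch.ones(length)
    if not left_bound:
        m[:border] = (torch.arange(border) + 1) / border
    if not right_bound:
        m[-border:] = torch.flip((torch.arange(border) + 1) / border, dims=(0,))
    return m

T, window, stride = 21, 9, 6
value, weight = torch.zeros(T), torch.zeros(T)
for t in range(0, T, stride):
    t_end = min(t + window, T)
    chunk = torch.full((t_end - t,), float(t))                  # stand-in for one sliding-window model call
    mask = ramp_mask(t_end - t, t == 0, t_end == T, window - stride)
    value[t:t_end] += chunk * mask
    weight[t:t_end] += mask
blended = value / weight                                        # overlap regions become weighted averages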
preprocessor/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ from .image_input_preprocessor import FaceProcessor
2
+ from .videomask_generator import VideoMaskGenerator
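The two classes exported here are implemented in the files below. A minimal usage sketch, with placeholder checkpoint and input paths (the repository's own inference scripts wire these up differently):

from preprocessor import FaceProcessor, VideoMaskGenerator

# "checkpoints/antelopev2", "face.jpg" and "input.mp4" are placeholders, not paths from this repo.
face_processor = FaceProcessor(antelopv2_path="checkpoints/antelopev2")
ip_image = face_processor.process("face.jpg", resize_to=512)   # face crop on a white background

mask_generator = VideoMaskGenerator(antelopv2_path="checkpoints/antelopev2")
video, masks, width, height, num_frames = mask_generator.process("input.mp4", "face.jpg")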
preprocessor/image_input_preprocessor.py ADDED
@@ -0,0 +1,181 @@
1
+ import os
2
+ import cv2
3
+ import requests
4
+ import torch
5
+ import numpy as np
6
+ import PIL.Image
7
+ import PIL.ImageOps
8
+ from insightface.app import FaceAnalysis
9
+ from facexlib.parsing import init_parsing_model
10
+ from torchvision.transforms.functional import normalize
11
+ from typing import Union, Optional
12
+
13
+
14
+ def _img2tensor(img: np.ndarray, bgr2rgb: bool = True) -> torch.Tensor:
15
+ if bgr2rgb:
16
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
17
+ img = img.astype(np.float32) / 255.0
18
+ img = np.transpose(img, (2, 0, 1))
19
+ return torch.from_numpy(img)
20
+
21
+
22
+ def _pad_to_square(img: np.ndarray, pad_color: int = 255) -> np.ndarray:
23
+ h, w, _ = img.shape
24
+ if h == w:
25
+ return img
26
+
27
+ if h > w:
28
+ pad_size = (h - w) // 2
29
+ padded_img = cv2.copyMakeBorder(
30
+ img,
31
+ 0,
32
+ 0,
33
+ pad_size,
34
+ h - w - pad_size,
35
+ cv2.BORDER_CONSTANT,
36
+ value=[pad_color] * 3,
37
+ )
38
+ else:
39
+ pad_size = (w - h) // 2
40
+ padded_img = cv2.copyMakeBorder(
41
+ img,
42
+ pad_size,
43
+ w - h - pad_size,
44
+ 0,
45
+ 0,
46
+ cv2.BORDER_CONSTANT,
47
+ value=[pad_color] * 3,
48
+ )
49
+
50
+ return padded_img
51
+
52
+
53
+ class FaceProcessor:
54
+ def __init__(self, antelopv2_path=".", device: Optional[torch.device] = None):
55
+ if device is None:
56
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
57
+ else:
58
+ self.device = device
59
+
60
+ providers = (
61
+ ["CUDAExecutionProvider"]
62
+ if self.device.type == "cuda"
63
+ else ["CPUExecutionProvider"]
64
+ )
65
+ self.app = FaceAnalysis(
66
+ name="antelopev2", root=antelopv2_path, providers=providers
67
+ )
68
+ self.app.prepare(ctx_id=0, det_size=(640, 640))
69
+
70
+ self.parsing_model = init_parsing_model(
71
+ model_name="bisenet", device=self.device
72
+ )
73
+ self.parsing_model.eval()
74
+
75
+ print("FaceProcessor initialized successfully.")
76
+
77
+ def process(
78
+ self,
79
+ image: Union[str, PIL.Image.Image],
80
+ resize_to: int = 512,
81
+ border_thresh: int = 10,
82
+ face_crop_scale: float = 1.5,
83
+ extra_input: bool = False,
84
+ ) -> PIL.Image.Image:
85
+ if isinstance(image, str):
86
+ if image.startswith("http://") or image.startswith("https://"):
87
+ image = PIL.Image.open(requests.get(image, stream=True, timeout=10).raw)
88
+ elif os.path.isfile(image):
89
+ image = PIL.Image.open(image)
90
+ else:
91
+ raise ValueError(
92
+ f"Input string is not a valid URL or file path: {image}"
93
+ )
94
+ elif not isinstance(image, PIL.Image.Image):
95
+ raise TypeError(
96
+ "Input must be a file path, a URL, or a PIL.Image.Image object."
97
+ )
98
+
99
+ image = PIL.ImageOps.exif_transpose(image).convert("RGB")
100
+
101
+ frame = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
102
+
103
+ faces = self.app.get(frame)
104
+ h, w, _ = frame.shape
105
+ image_to_process = None
106
+
107
+ if not faces:
108
+ print(
109
+ "[Warning] No face detected. Using the whole image, padded to square."
110
+ )
111
+ image_to_process = _pad_to_square(frame, pad_color=255)
112
+ else:
113
+ largest_face = max(
114
+ faces, key=lambda f: (f.bbox[2] - f.bbox[0]) * (f.bbox[3] - f.bbox[1])
115
+ )
116
+ x1, y1, x2, y2 = map(int, largest_face.bbox)
117
+
118
+ is_close_to_border = (
119
+ x1 <= border_thresh
120
+ and y1 <= border_thresh
121
+ and x2 >= w - border_thresh
122
+ and y2 >= h - border_thresh
123
+ )
124
+
125
+ if is_close_to_border:
126
+ print(
127
+ "[Info] Face is close to border, padding original image to square."
128
+ )
129
+ image_to_process = _pad_to_square(frame, pad_color=255)
130
+ else:
131
+ cx, cy = (x1 + x2) // 2, (y1 + y2) // 2
132
+ side = int(max(x2 - x1, y2 - y1) * face_crop_scale)
133
+ half = side // 2
134
+
135
+ left = max(cx - half, 0)
136
+ top = max(cy - half, 0)
137
+ right = min(cx + half, w)
138
+ bottom = min(cy + half, h)
139
+
140
+ cropped_face = frame[top:bottom, left:right]
141
+ image_to_process = _pad_to_square(cropped_face, pad_color=255)
142
+
143
+ image_resized = cv2.resize(
144
+ image_to_process, (resize_to, resize_to), interpolation=cv2.INTER_AREA
145
+ )
146
+
147
+ face_tensor = (
148
+ _img2tensor(image_resized, bgr2rgb=True).unsqueeze(0).to(self.device)
149
+ )
150
+ with torch.no_grad():
151
+ normalized_face = normalize(face_tensor, [0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
152
+ parsing_out = self.parsing_model(normalized_face)[0]
153
+ parsing_mask = parsing_out.argmax(dim=1, keepdim=True)
154
+
155
+ background_mask_np = (parsing_mask.squeeze().cpu().numpy() == 0).astype(
156
+ np.uint8
157
+ )
158
+ white_background = np.ones_like(image_resized, dtype=np.uint8) * 255
159
+ mask_3channel = cv2.cvtColor(background_mask_np * 255, cv2.COLOR_GRAY2BGR)
160
+ result_img_bgr = np.where(mask_3channel == 255, white_background, image_resized)
161
+ result_img_rgb = cv2.cvtColor(result_img_bgr, cv2.COLOR_BGR2RGB)
162
+ img_white_bg = PIL.Image.fromarray(result_img_rgb)
163
+ if extra_input:
164
+ # Additionally create a version of the image with a transparent background
165
+ # Create an alpha channel: 255 for foreground (not background), 0 for background
166
+ alpha_channel = (parsing_mask.squeeze().cpu().numpy() != 0).astype(
167
+ np.uint8
168
+ ) * 255
169
+
170
+ # Convert the resized BGR image to RGB
171
+ image_resized_rgb = cv2.cvtColor(image_resized, cv2.COLOR_BGR2RGB)
172
+
173
+ # Stack RGB channels with the new alpha channel
174
+ rgba_image = np.dstack((image_resized_rgb, alpha_channel))
175
+
176
+ # Create PIL image from the RGBA numpy array
177
+ img_transparent_bg = PIL.Image.fromarray(rgba_image, "RGBA")
178
+
179
+ return img_white_bg, img_transparent_bg
180
+ else:
181
+ return img_white_bg
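The compositing step above treats BiSeNet label 0 as background and either paints it white or turns it into a zero-alpha region. A numpy-only sketch of that masking logic, using a random stand-in for the parsing map:

import numpy as np

parsing = np.random.randint(0, 14, size=(512, 512))                 # fake BiSeNet labels 0..13
image = np.random.randint(0, 256, size=(512, 512, 3), dtype=np.uint8)

background = parsing == 0
white_bg = image.copy()
white_bg[background] = 255                                          # white where the parser saw background

alpha = np.where(background, 0, 255).astype(np.uint8)               # transparent-background variant
rgba = np.dstack([image, alpha])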
preprocessor/videomask_generator.py ADDED
@@ -0,0 +1,242 @@
1
+ import torch
2
+ import cv2
3
+ import numpy as np
4
+ from torchvision.transforms.functional import normalize
5
+ from tqdm import tqdm
6
+ from PIL import Image, ImageOps
7
+ import random
8
+ import os
9
+ import requests
10
+ from insightface.app import FaceAnalysis
11
+ from facexlib.parsing import init_parsing_model
12
+ from typing import Union, Optional, Tuple, List
13
+
14
+ # --- Helper Functions ---
15
+ def tensor_to_cv2_img(tensor_frame: torch.Tensor) -> np.ndarray:
16
+ """Converts a single RGB torch tensor to a BGR OpenCV image."""
17
+ img_np = (tensor_frame.cpu().numpy() * 255).astype(np.uint8)
18
+ return cv2.cvtColor(img_np, cv2.COLOR_RGB2BGR)
19
+
20
+ def tensor_to_cv2_bgra_img(tensor_frame: torch.Tensor) -> np.ndarray:
21
+ """Converts a single RGBA torch tensor to a BGRA OpenCV image."""
22
+ if tensor_frame.shape[2] != 4:
23
+ raise ValueError("Input tensor must be an RGBA image with 4 channels.")
24
+ img_np = (tensor_frame.cpu().numpy() * 255).astype(np.uint8)
25
+ return cv2.cvtColor(img_np, cv2.COLOR_RGBA2BGRA)
26
+
27
+ def pil_to_tensor(image: Image.Image) -> torch.Tensor:
28
+ """Converts a PIL image to a torch tensor."""
29
+ return torch.from_numpy(np.array(image).astype(np.float32) / 255.0)
30
+
31
+ class VideoMaskGenerator:
32
+ def __init__(self, antelopv2_path=".", device: Optional[torch.device] = None):
33
+ if device is None:
34
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
35
+ else:
36
+ self.device = device
37
+
38
+ print(f"Using device: {self.device}")
39
+
40
+ providers = ["CUDAExecutionProvider"] if self.device.type == "cuda" else ["CPUExecutionProvider"]
41
+
42
+ # Initialize face detection and landmark model (antelopev2 provides both)
43
+ self.detection_model = FaceAnalysis(name="antelopev2", root=antelopv2_path, providers=providers)
44
+ self.detection_model.prepare(ctx_id=0, det_size=(640, 640))
45
+
46
+ # Initialize face parsing model
47
+ self.parsing_model = init_parsing_model(model_name="bisenet", device=self.device)
48
+ self.parsing_model.eval()
49
+
50
+ print("FaceProcessor initialized successfully.")
51
+
52
+ def process(
53
+ self,
54
+ video_path: str,
55
+ face_image: Union[str, Image.Image],
56
+ confidence_threshold: float = 0.5,
57
+ face_crop_scale: float = 1.5,
58
+ dilation_kernel_size: int = 10,
59
+ feather_amount: int = 21,
60
+ random_horizontal_flip_chance: float = 0.0,
61
+ match_angle_and_size: bool = True
62
+ ) -> Tuple[np.ndarray, np.ndarray, int, int, int]:
63
+ """
64
+ Processes a video to replace a face with a provided face image.
65
+
66
+ Args:
67
+ video_path (str): Path to the input video file.
68
+ face_image (Union[str, Image.Image]): Path or PIL image of the face to paste.
69
+ confidence_threshold (float): Confidence threshold for face detection.
70
+ face_crop_scale (float): Scale factor for cropping the detected face box.
71
+ dilation_kernel_size (int): Kernel size for mask dilation.
72
+ feather_amount (int): Amount of feathering for the mask edges.
73
+ random_horizontal_flip_chance (float): Chance to flip the source face horizontally.
74
+ match_angle_and_size (bool): Whether to use landmark matching for rotation and scale.
75
+
76
+ Returns:
77
+ Tuple[np.ndarray, np.ndarray, int, int, int]:
78
+ - Processed video as a numpy array (F, H, W, C).
79
+ - Generated masks as a numpy array (F, H, W).
80
+ - Width of the processed video.
81
+ - Height of the processed video.
82
+ - Number of frames in the processed video.
83
+ """
84
+ # --- Video Pre-processing ---
85
+ if not os.path.exists(video_path):
86
+ raise FileNotFoundError(f"Video file not found at: {video_path}")
87
+
88
+ cap = cv2.VideoCapture(video_path)
89
+ frames = []
90
+ while cap.isOpened():
91
+ ret, frame = cap.read()
92
+ if not ret:
93
+ break
94
+ frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
95
+ cap.release()
96
+
97
+ if not frames:
98
+ raise ValueError("Could not read any frames from the video.")
99
+
100
+ video_np = np.array(frames)
101
+
102
+ h, w = video_np.shape[1], video_np.shape[2]
103
+ new_h, new_w = (h // 16) * 16, (w // 16) * 16
104
+
105
+ y_start = (h - new_h) // 2
106
+ x_start = (w - new_w) // 2
107
+ video_cropped = video_np[:, y_start:y_start+new_h, x_start:x_start+new_w, :]
108
+
109
+ num_frames = video_cropped.shape[0]
110
+ target_frames = (num_frames // 4) * 4 + 1
111
+ video_trimmed = video_cropped[:target_frames]
112
+
113
+ final_h, final_w, final_frames = video_trimmed.shape[1], video_trimmed.shape[2], video_trimmed.shape[0]
114
+ print(f"Video pre-processed: {final_w}x{final_h}, {final_frames} frames.")
115
+
116
+ # --- Face Image Pre-processing & Source Landmark Extraction ---
117
+ if isinstance(face_image, str):
118
+ if face_image.startswith("http"):
119
+ face_image = Image.open(requests.get(face_image, stream=True, timeout=10).raw)
120
+ else:
121
+ face_image = Image.open(face_image)
122
+
123
+ face_image = ImageOps.exif_transpose(face_image).convert("RGBA")
124
+ face_rgba_tensor = pil_to_tensor(face_image)
125
+ face_to_paste_cv2 = tensor_to_cv2_bgra_img(face_rgba_tensor)
126
+
127
+ source_kpts = None
128
+ if match_angle_and_size:
129
+ # Use insightface (antelopev2) to get landmarks from the source face image
130
+ source_face_bgr = cv2.cvtColor(face_to_paste_cv2, cv2.COLOR_BGRA2BGR)
131
+ source_faces = self.detection_model.get(source_face_bgr)
132
+ if source_faces:
133
+ # Use the landmarks from the first (and likely only) detected face
134
+ source_kpts = source_faces[0].kps
135
+ else:
136
+ print("[Warning] No face or landmarks found in source image. Disabling angle matching.")
137
+ match_angle_and_size = False
138
+
139
+ face_to_paste_pil = Image.fromarray((face_rgba_tensor.cpu().numpy() * 255).astype(np.uint8), 'RGBA')
140
+
141
+ # --- Main Processing Loop ---
142
+ processed_frames_list = []
143
+ mask_list = []
144
+
145
+ for i in tqdm(range(final_frames), desc="Pasting face onto frames"):
146
+ frame_rgb = video_trimmed[i]
147
+ frame_bgr = cv2.cvtColor(frame_rgb, cv2.COLOR_RGB2BGR)
148
+
149
+ # Use insightface for detection and landmarks
150
+ faces = self.detection_model.get(frame_bgr)
151
+
152
+ pasted = False
153
+ final_mask = np.zeros((final_h, final_w), dtype=np.uint8)
154
+
155
+ if faces:
156
+ largest_face = max(faces, key=lambda f: (f.bbox[2] - f.bbox[0]) * (f.bbox[3] - f.bbox[1]))
157
+
158
+ if largest_face.det_score > confidence_threshold:
159
+ # Use insightface landmarks to estimate the affine transform
160
+ if match_angle_and_size and source_kpts is not None:
161
+ target_kpts = largest_face.kps # Get landmarks directly from the detected face
162
+
163
+ # Estimate the transformation matrix
164
+ M, _ = cv2.estimateAffinePartial2D(source_kpts, target_kpts, method=cv2.LMEDS)
165
+
166
+ if M is not None:
167
+ # Split the RGBA source face for separate warping
168
+ b, g, r, a = cv2.split(face_to_paste_cv2)
169
+ source_rgb_cv2 = cv2.merge([r, g, b])
170
+
171
+ # Warp the face and its alpha channel
172
+ warped_face = cv2.warpAffine(source_rgb_cv2, M, (final_w, final_h))
173
+ warped_alpha = cv2.warpAffine(a, M, (final_w, final_h))
174
+
175
+ # Blend the warped face onto the frame using the warped alpha channel
176
+ alpha_float = warped_alpha.astype(np.float32) / 255.0
177
+ alpha_expanded = np.expand_dims(alpha_float, axis=2)
178
+
179
+ frame_rgb = (1.0 - alpha_expanded) * frame_rgb + alpha_expanded * warped_face
180
+ frame_rgb = frame_rgb.astype(np.uint8)
181
+ final_mask = warped_alpha
182
+ pasted = True
183
+
184
+ # Fallback to simple box-pasting if angle matching is off or fails
185
+ if not pasted:
186
+ x1, y1, x2, y2 = map(int, largest_face.bbox)
187
+ center_x, center_y = (x1 + x2) // 2, (y1 + y2) // 2
188
+ side_len = int(max(x2 - x1, y2 - y1) * face_crop_scale)
189
+ half_side = side_len // 2
190
+
191
+ crop_y1, crop_x1 = max(center_y - half_side, 0), max(center_x - half_side, 0)
192
+ crop_y2, crop_x2 = min(center_y + half_side, final_h), min(center_x + half_side, final_w)
193
+
194
+ box_w, box_h = crop_x2 - crop_x1, crop_y2 - crop_y1
195
+
196
+ if box_w > 0 and box_h > 0:
197
+ source_img = face_to_paste_pil.copy()
198
+ if random.random() < random_horizontal_flip_chance:
199
+ source_img = source_img.transpose(Image.FLIP_LEFT_RIGHT)
200
+
201
+ face_resized = source_img.resize((box_w, box_h), Image.Resampling.LANCZOS)
202
+
203
+ target_frame_pil = Image.fromarray(frame_rgb)
204
+
205
+ # --- Mask Generation using BiSeNet ---
206
+ face_crop_bgr = cv2.cvtColor(frame_rgb[crop_y1:crop_y2, crop_x1:crop_x2], cv2.COLOR_RGB2BGR)
207
+ if face_crop_bgr.size > 0:
208
+ face_resized_512 = cv2.resize(face_crop_bgr, (512, 512), interpolation=cv2.INTER_AREA)
209
+ face_rgb_512 = cv2.cvtColor(face_resized_512, cv2.COLOR_BGR2RGB)
210
+ face_tensor_in = torch.from_numpy(face_rgb_512.astype(np.float32) / 255.0).permute(2, 0, 1).unsqueeze(0).to(self.device)
211
+
212
+ with torch.no_grad():
213
+ normalized_face = normalize(face_tensor_in, [0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
214
+ parsing_map = self.parsing_model(normalized_face)[0].argmax(dim=1, keepdim=True)
215
+
216
+ parsing_map_np = parsing_map.squeeze().cpu().numpy().astype(np.uint8)
217
+ parts_to_include = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13] # All face parts
218
+ final_mask_512 = np.isin(parsing_map_np, parts_to_include).astype(np.uint8) * 255
219
+
220
+ if dilation_kernel_size > 0:
221
+ kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (dilation_kernel_size, dilation_kernel_size))
222
+ final_mask_512 = cv2.dilate(final_mask_512, kernel, iterations=1)
223
+
224
+ if feather_amount > 0:
225
+ if feather_amount % 2 == 0: feather_amount += 1
226
+ final_mask_512 = cv2.GaussianBlur(final_mask_512, (feather_amount, feather_amount), 0)
227
+
228
+ mask_resized_to_crop = cv2.resize(final_mask_512, (box_w, box_h), interpolation=cv2.INTER_LINEAR)
229
+ generated_mask_pil = Image.fromarray(mask_resized_to_crop, mode='L')
230
+
231
+ target_frame_pil.paste(face_resized, (crop_x1, crop_y1), mask=generated_mask_pil)
232
+ frame_rgb = np.array(target_frame_pil)
233
+ final_mask[crop_y1:crop_y2, crop_x1:crop_x2] = mask_resized_to_crop
234
+
235
+ processed_frames_list.append(frame_rgb)
236
+ mask_list.append(final_mask)
237
+
238
+ output_video = np.stack(processed_frames_list)
239
+ # Ensure mask has a channel dimension for consistency
240
+ output_masks = np.stack(mask_list)[..., np.newaxis]
241
+
242
+ return (output_video, output_masks, final_w, final_h, final_frames)
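The angle/size matching above relies on cv2.estimateAffinePartial2D to map the five source keypoints onto the detected target keypoints, then warps the reference face (and its alpha channel) with the resulting matrix. A standalone sketch with made-up keypoints:

import cv2
import numpy as np

# Five face keypoints (eyes, nose, mouth corners); the coordinates are invented for illustration.
source_kpts = np.array([[180, 210], [330, 210], [256, 300], [200, 380], [310, 380]], dtype=np.float32)
target_kpts = source_kpts * 0.5 + np.array([40.0, 60.0], dtype=np.float32)   # smaller, shifted face

source_face = np.zeros((512, 512, 3), dtype=np.uint8)               # stand-in for the RGB reference face
M, _ = cv2.estimateAffinePartial2D(source_kpts, target_kpts, method=cv2.LMEDS)
warped = cv2.warpAffine(source_face, M, (512, 512))                 # dsize is (width, height) of the frame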
prompters/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from .prompt_refiners import Translator, BeautifulPrompt, QwenPrompt
2
+ from .omost import OmostPromter
3
+ from .wan_prompter import WanPrompter
prompters/base_prompter.py ADDED
@@ -0,0 +1,68 @@
1
+ from models.model_manager import ModelManager
2
+ import torch
3
+
4
+
5
+ def tokenize_long_prompt(tokenizer, prompt, max_length=None):
6
+ # Use the tokenizer's model_max_length unless an explicit max_length is given.
7
+ length = tokenizer.model_max_length if max_length is None else max_length
8
+
9
+ # Temporarily raise tokenizer.model_max_length to avoid the long-sequence warning.
10
+ tokenizer.model_max_length = 99999999
11
+
12
+ # Tokenize it!
13
+ input_ids = tokenizer(prompt, return_tensors="pt").input_ids
14
+
15
+ # Determine the real length.
16
+ max_length = (input_ids.shape[1] + length - 1) // length * length
17
+
18
+ # Restore tokenizer.model_max_length
19
+ tokenizer.model_max_length = length
20
+
21
+ # Tokenize it again with fixed length.
22
+ input_ids = tokenizer(
23
+ prompt,
24
+ return_tensors="pt",
25
+ padding="max_length",
26
+ max_length=max_length,
27
+ truncation=True,
28
+ ).input_ids
29
+
30
+ # Reshape input_ids to fit the text encoder.
31
+ num_sentence = input_ids.shape[1] // length
32
+ input_ids = input_ids.reshape((num_sentence, length))
33
+
34
+ return input_ids
35
+
36
+
37
+ class BasePrompter:
38
+ def __init__(self):
39
+ self.refiners = []
40
+ self.extenders = []
41
+
42
+ def load_prompt_refiners(self, model_manager: ModelManager, refiner_classes=[]):
43
+ for refiner_class in refiner_classes:
44
+ refiner = refiner_class.from_model_manager(model_manager)
45
+ self.refiners.append(refiner)
46
+
47
+ def load_prompt_extenders(self, model_manager: ModelManager, extender_classes=[]):
48
+ for extender_class in extender_classes:
49
+ extender = extender_class.from_model_manager(model_manager)
50
+ self.extenders.append(extender)
51
+
52
+ @torch.no_grad()
53
+ def process_prompt(self, prompt, positive=True):
54
+ if isinstance(prompt, list):
55
+ prompt = [
56
+ self.process_prompt(prompt_, positive=positive) for prompt_ in prompt
57
+ ]
58
+ else:
59
+ for refiner in self.refiners:
60
+ prompt = refiner(prompt, positive=positive)
61
+ return prompt
62
+
63
+ @torch.no_grad()
64
+ def extend_prompt(self, prompt: str, positive=True):
65
+ extended_prompt = dict(prompt=prompt)
66
+ for extender in self.extenders:
67
+ extended_prompt = extender(extended_prompt)
68
+ return extended_prompt
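tokenize_long_prompt above rounds the tokenized length up to a multiple of the tokenizer window and reshapes the result so the text encoder sees several fixed-length "sentences". A tensor-only sketch of the rounding and reshape (no real tokenizer involved):

import torch

model_max_length = 77
num_tokens = 180                                            # pretend the prompt tokenized to 180 ids
padded = (num_tokens + model_max_length - 1) // model_max_length * model_max_length   # -> 231
input_ids = torch.zeros(1, padded, dtype=torch.long)        # stand-in for the padded tokenizer output
chunks = input_ids.reshape(padded // model_max_length, model_max_length)              # shape (3, 77)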
prompters/omost.py ADDED
@@ -0,0 +1,472 @@
1
+ from transformers import AutoTokenizer, TextIteratorStreamer
2
+ import difflib
3
+ import torch
4
+ import numpy as np
5
+ import re
6
+ from models.model_manager import ModelManager
7
+ from PIL import Image
8
+
9
+ valid_colors = { # r, g, b
10
+ "aliceblue": (240, 248, 255),
11
+ "antiquewhite": (250, 235, 215),
12
+ "aqua": (0, 255, 255),
13
+ "aquamarine": (127, 255, 212),
14
+ "azure": (240, 255, 255),
15
+ "beige": (245, 245, 220),
16
+ "bisque": (255, 228, 196),
17
+ "black": (0, 0, 0),
18
+ "blanchedalmond": (255, 235, 205),
19
+ "blue": (0, 0, 255),
20
+ "blueviolet": (138, 43, 226),
21
+ "brown": (165, 42, 42),
22
+ "burlywood": (222, 184, 135),
23
+ "cadetblue": (95, 158, 160),
24
+ "chartreuse": (127, 255, 0),
25
+ "chocolate": (210, 105, 30),
26
+ "coral": (255, 127, 80),
27
+ "cornflowerblue": (100, 149, 237),
28
+ "cornsilk": (255, 248, 220),
29
+ "crimson": (220, 20, 60),
30
+ "cyan": (0, 255, 255),
31
+ "darkblue": (0, 0, 139),
32
+ "darkcyan": (0, 139, 139),
33
+ "darkgoldenrod": (184, 134, 11),
34
+ "darkgray": (169, 169, 169),
35
+ "darkgrey": (169, 169, 169),
36
+ "darkgreen": (0, 100, 0),
37
+ "darkkhaki": (189, 183, 107),
38
+ "darkmagenta": (139, 0, 139),
39
+ "darkolivegreen": (85, 107, 47),
40
+ "darkorange": (255, 140, 0),
41
+ "darkorchid": (153, 50, 204),
42
+ "darkred": (139, 0, 0),
43
+ "darksalmon": (233, 150, 122),
44
+ "darkseagreen": (143, 188, 143),
45
+ "darkslateblue": (72, 61, 139),
46
+ "darkslategray": (47, 79, 79),
47
+ "darkslategrey": (47, 79, 79),
48
+ "darkturquoise": (0, 206, 209),
49
+ "darkviolet": (148, 0, 211),
50
+ "deeppink": (255, 20, 147),
51
+ "deepskyblue": (0, 191, 255),
52
+ "dimgray": (105, 105, 105),
53
+ "dimgrey": (105, 105, 105),
54
+ "dodgerblue": (30, 144, 255),
55
+ "firebrick": (178, 34, 34),
56
+ "floralwhite": (255, 250, 240),
57
+ "forestgreen": (34, 139, 34),
58
+ "fuchsia": (255, 0, 255),
59
+ "gainsboro": (220, 220, 220),
60
+ "ghostwhite": (248, 248, 255),
61
+ "gold": (255, 215, 0),
62
+ "goldenrod": (218, 165, 32),
63
+ "gray": (128, 128, 128),
64
+ "grey": (128, 128, 128),
65
+ "green": (0, 128, 0),
66
+ "greenyellow": (173, 255, 47),
67
+ "honeydew": (240, 255, 240),
68
+ "hotpink": (255, 105, 180),
69
+ "indianred": (205, 92, 92),
70
+ "indigo": (75, 0, 130),
71
+ "ivory": (255, 255, 240),
72
+ "khaki": (240, 230, 140),
73
+ "lavender": (230, 230, 250),
74
+ "lavenderblush": (255, 240, 245),
75
+ "lawngreen": (124, 252, 0),
76
+ "lemonchiffon": (255, 250, 205),
77
+ "lightblue": (173, 216, 230),
78
+ "lightcoral": (240, 128, 128),
79
+ "lightcyan": (224, 255, 255),
80
+ "lightgoldenrodyellow": (250, 250, 210),
81
+ "lightgray": (211, 211, 211),
82
+ "lightgrey": (211, 211, 211),
83
+ "lightgreen": (144, 238, 144),
84
+ "lightpink": (255, 182, 193),
85
+ "lightsalmon": (255, 160, 122),
86
+ "lightseagreen": (32, 178, 170),
87
+ "lightskyblue": (135, 206, 250),
88
+ "lightslategray": (119, 136, 153),
89
+ "lightslategrey": (119, 136, 153),
90
+ "lightsteelblue": (176, 196, 222),
91
+ "lightyellow": (255, 255, 224),
92
+ "lime": (0, 255, 0),
93
+ "limegreen": (50, 205, 50),
94
+ "linen": (250, 240, 230),
95
+ "magenta": (255, 0, 255),
96
+ "maroon": (128, 0, 0),
97
+ "mediumaquamarine": (102, 205, 170),
98
+ "mediumblue": (0, 0, 205),
99
+ "mediumorchid": (186, 85, 211),
100
+ "mediumpurple": (147, 112, 219),
101
+ "mediumseagreen": (60, 179, 113),
102
+ "mediumslateblue": (123, 104, 238),
103
+ "mediumspringgreen": (0, 250, 154),
104
+ "mediumturquoise": (72, 209, 204),
105
+ "mediumvioletred": (199, 21, 133),
106
+ "midnightblue": (25, 25, 112),
107
+ "mintcream": (245, 255, 250),
108
+ "mistyrose": (255, 228, 225),
109
+ "moccasin": (255, 228, 181),
110
+ "navajowhite": (255, 222, 173),
111
+ "navy": (0, 0, 128),
112
+ "navyblue": (0, 0, 128),
113
+ "oldlace": (253, 245, 230),
114
+ "olive": (128, 128, 0),
115
+ "olivedrab": (107, 142, 35),
116
+ "orange": (255, 165, 0),
117
+ "orangered": (255, 69, 0),
118
+ "orchid": (218, 112, 214),
119
+ "palegoldenrod": (238, 232, 170),
120
+ "palegreen": (152, 251, 152),
121
+ "paleturquoise": (175, 238, 238),
122
+ "palevioletred": (219, 112, 147),
123
+ "papayawhip": (255, 239, 213),
124
+ "peachpuff": (255, 218, 185),
125
+ "peru": (205, 133, 63),
126
+ "pink": (255, 192, 203),
127
+ "plum": (221, 160, 221),
128
+ "powderblue": (176, 224, 230),
129
+ "purple": (128, 0, 128),
130
+ "rebeccapurple": (102, 51, 153),
131
+ "red": (255, 0, 0),
132
+ "rosybrown": (188, 143, 143),
133
+ "royalblue": (65, 105, 225),
134
+ "saddlebrown": (139, 69, 19),
135
+ "salmon": (250, 128, 114),
136
+ "sandybrown": (244, 164, 96),
137
+ "seagreen": (46, 139, 87),
138
+ "seashell": (255, 245, 238),
139
+ "sienna": (160, 82, 45),
140
+ "silver": (192, 192, 192),
141
+ "skyblue": (135, 206, 235),
142
+ "slateblue": (106, 90, 205),
143
+ "slategray": (112, 128, 144),
144
+ "slategrey": (112, 128, 144),
145
+ "snow": (255, 250, 250),
146
+ "springgreen": (0, 255, 127),
147
+ "steelblue": (70, 130, 180),
148
+ "tan": (210, 180, 140),
149
+ "teal": (0, 128, 128),
150
+ "thistle": (216, 191, 216),
151
+ "tomato": (255, 99, 71),
152
+ "turquoise": (64, 224, 208),
153
+ "violet": (238, 130, 238),
154
+ "wheat": (245, 222, 179),
155
+ "white": (255, 255, 255),
156
+ "whitesmoke": (245, 245, 245),
157
+ "yellow": (255, 255, 0),
158
+ "yellowgreen": (154, 205, 50),
159
+ }
160
+
161
+ valid_locations = { # x, y in 90*90
162
+ "in the center": (45, 45),
163
+ "on the left": (15, 45),
164
+ "on the right": (75, 45),
165
+ "on the top": (45, 15),
166
+ "on the bottom": (45, 75),
167
+ "on the top-left": (15, 15),
168
+ "on the top-right": (75, 15),
169
+ "on the bottom-left": (15, 75),
170
+ "on the bottom-right": (75, 75),
171
+ }
172
+
173
+ valid_offsets = { # x, y in 90*90
174
+ "no offset": (0, 0),
175
+ "slightly to the left": (-10, 0),
176
+ "slightly to the right": (10, 0),
177
+ "slightly to the upper": (0, -10),
178
+ "slightly to the lower": (0, 10),
179
+ "slightly to the upper-left": (-10, -10),
180
+ "slightly to the upper-right": (10, -10),
181
+ "slightly to the lower-left": (-10, 10),
182
+ "slightly to the lower-right": (10, 10),
183
+ }
184
+
185
+ valid_areas = { # w, h in 90*90
186
+ "a small square area": (50, 50),
187
+ "a small vertical area": (40, 60),
188
+ "a small horizontal area": (60, 40),
189
+ "a medium-sized square area": (60, 60),
190
+ "a medium-sized vertical area": (50, 80),
191
+ "a medium-sized horizontal area": (80, 50),
192
+ "a large square area": (70, 70),
193
+ "a large vertical area": (60, 90),
194
+ "a large horizontal area": (90, 60),
195
+ }
196
+
197
+
198
+ def safe_str(x):
199
+ return x.strip(",. ") + "."
200
+
201
+
202
+ def closest_name(input_str, options):
203
+ input_str = input_str.lower()
204
+
205
+ closest_match = difflib.get_close_matches(
206
+ input_str, list(options.keys()), n=1, cutoff=0.5
207
+ )
208
+ assert isinstance(closest_match, list) and len(closest_match) > 0, (
209
+ f"The value [{input_str}] is not valid!"
210
+ )
211
+ result = closest_match[0]
212
+
213
+ if result != input_str:
214
+ print(f"Automatically corrected [{input_str}] -> [{result}].")
215
+
216
+ return result
217
+
218
+
219
+ class Canvas:
220
+ @staticmethod
221
+ def from_bot_response(response: str):
222
+ matched = re.search(r"```python\n(.*?)\n```", response, re.DOTALL)
223
+ assert matched, "Response does not contain codes!"
224
+ code_content = matched.group(1)
225
+ assert "canvas = Canvas()" in code_content, (
226
+ "Code block must include valid canvas var!"
227
+ )
228
+ local_vars = {"Canvas": Canvas}
229
+ exec(code_content, {}, local_vars)
230
+ canvas = local_vars.get("canvas", None)
231
+ assert isinstance(canvas, Canvas), "Code block must produce valid canvas var!"
232
+ return canvas
233
+
234
+ def __init__(self):
235
+ self.components = []
236
+ self.color = None
237
+ self.record_tags = True
238
+ self.prefixes = []
239
+ self.suffixes = []
240
+ return
241
+
242
+ def set_global_description(
243
+ self,
244
+ description: str,
245
+ detailed_descriptions: list,
246
+ tags: str,
247
+ HTML_web_color_name: str,
248
+ ):
249
+ assert isinstance(description, str), "Global description is not valid!"
250
+ assert isinstance(detailed_descriptions, list) and all(
251
+ isinstance(item, str) for item in detailed_descriptions
252
+ ), "Global detailed_descriptions is not valid!"
253
+ assert isinstance(tags, str), "Global tags is not valid!"
254
+
255
+ HTML_web_color_name = closest_name(HTML_web_color_name, valid_colors)
256
+ self.color = np.array([[valid_colors[HTML_web_color_name]]], dtype=np.uint8)
257
+
258
+ self.prefixes = [description]
259
+ self.suffixes = detailed_descriptions
260
+
261
+ if self.record_tags:
262
+ self.suffixes = self.suffixes + [tags]
263
+
264
+ self.prefixes = [safe_str(x) for x in self.prefixes]
265
+ self.suffixes = [safe_str(x) for x in self.suffixes]
266
+
267
+ return
268
+
269
+ def add_local_description(
270
+ self,
271
+ location: str,
272
+ offset: str,
273
+ area: str,
274
+ distance_to_viewer: float,
275
+ description: str,
276
+ detailed_descriptions: list,
277
+ tags: str,
278
+ atmosphere: str,
279
+ style: str,
280
+ quality_meta: str,
281
+ HTML_web_color_name: str,
282
+ ):
283
+ assert isinstance(description, str), "Local description is wrong!"
284
+ assert (
285
+ isinstance(distance_to_viewer, (int, float)) and distance_to_viewer > 0
286
+ ), f"The distance_to_viewer for [{description}] is not positive float number!"
287
+ assert isinstance(detailed_descriptions, list) and all(
288
+ isinstance(item, str) for item in detailed_descriptions
289
+ ), f"The detailed_descriptions for [{description}] is not valid!"
290
+ assert isinstance(tags, str), f"The tags for [{description}] is not valid!"
291
+ assert isinstance(atmosphere, str), (
292
+ f"The atmosphere for [{description}] is not valid!"
293
+ )
294
+ assert isinstance(style, str), f"The style for [{description}] is not valid!"
295
+ assert isinstance(quality_meta, str), (
296
+ f"The quality_meta for [{description}] is not valid!"
297
+ )
298
+
299
+ location = closest_name(location, valid_locations)
300
+ offset = closest_name(offset, valid_offsets)
301
+ area = closest_name(area, valid_areas)
302
+ HTML_web_color_name = closest_name(HTML_web_color_name, valid_colors)
303
+
304
+ xb, yb = valid_locations[location]
305
+ xo, yo = valid_offsets[offset]
306
+ w, h = valid_areas[area]
307
+ rect = (yb + yo - h // 2, yb + yo + h // 2, xb + xo - w // 2, xb + xo + w // 2)
308
+ rect = [max(0, min(90, i)) for i in rect]
309
+ color = np.array([[valid_colors[HTML_web_color_name]]], dtype=np.uint8)
310
+
311
+ prefixes = self.prefixes + [description]
312
+ suffixes = detailed_descriptions
313
+
314
+ if self.record_tags:
315
+ suffixes = suffixes + [tags, atmosphere, style, quality_meta]
316
+
317
+ prefixes = [safe_str(x) for x in prefixes]
318
+ suffixes = [safe_str(x) for x in suffixes]
319
+
320
+ self.components.append(
321
+ dict(
322
+ rect=rect,
323
+ distance_to_viewer=distance_to_viewer,
324
+ color=color,
325
+ prefixes=prefixes,
326
+ suffixes=suffixes,
327
+ location=location,
328
+ )
329
+ )
330
+
331
+ return
332
+
333
+ def process(self):
334
+ # sort components
335
+ self.components = sorted(
336
+ self.components, key=lambda x: x["distance_to_viewer"], reverse=True
337
+ )
338
+
339
+ # compute initial latent
340
+ # print(self.color)
341
+ initial_latent = np.zeros(shape=(90, 90, 3), dtype=np.float32) + self.color
342
+
343
+ for component in self.components:
344
+ a, b, c, d = component["rect"]
345
+ initial_latent[a:b, c:d] = (
346
+ 0.7 * component["color"] + 0.3 * initial_latent[a:b, c:d]
347
+ )
348
+
349
+ initial_latent = initial_latent.clip(0, 255).astype(np.uint8)
350
+
351
+ # compute conditions
352
+
353
+ bag_of_conditions = [
354
+ dict(
355
+ mask=np.ones(shape=(90, 90), dtype=np.float32),
356
+ prefixes=self.prefixes,
357
+ suffixes=self.suffixes,
358
+ location="full",
359
+ )
360
+ ]
361
+
362
+ for i, component in enumerate(self.components):
363
+ a, b, c, d = component["rect"]
364
+ m = np.zeros(shape=(90, 90), dtype=np.float32)
365
+ m[a:b, c:d] = 1.0
366
+ bag_of_conditions.append(
367
+ dict(
368
+ mask=m,
369
+ prefixes=component["prefixes"],
370
+ suffixes=component["suffixes"],
371
+ location=component["location"],
372
+ )
373
+ )
374
+
375
+ return dict(
376
+ initial_latent=initial_latent,
377
+ bag_of_conditions=bag_of_conditions,
378
+ )
379
+
380
+
381
+ class OmostPromter(torch.nn.Module):
382
+ def __init__(self, model=None, tokenizer=None, template="", device="cpu"):
383
+ super().__init__()
384
+ self.model = model
385
+ self.tokenizer = tokenizer
386
+ self.device = device
387
+ if template == "":
388
+ template = r"""You are a helpful AI assistant to compose images using the below python class `Canvas`:
389
+ ```python
390
+ class Canvas:
391
+ def set_global_description(self, description: str, detailed_descriptions: list[str], tags: str, HTML_web_color_name: str):
392
+ pass
393
+
394
+ def add_local_description(self, location: str, offset: str, area: str, distance_to_viewer: float, description: str, detailed_descriptions: list[str], tags: str, atmosphere: str, style: str, quality_meta: str, HTML_web_color_name: str):
395
+ assert location in ["in the center", "on the left", "on the right", "on the top", "on the bottom", "on the top-left", "on the top-right", "on the bottom-left", "on the bottom-right"]
396
+ assert offset in ["no offset", "slightly to the left", "slightly to the right", "slightly to the upper", "slightly to the lower", "slightly to the upper-left", "slightly to the upper-right", "slightly to the lower-left", "slightly to the lower-right"]
397
+ assert area in ["a small square area", "a small vertical area", "a small horizontal area", "a medium-sized square area", "a medium-sized vertical area", "a medium-sized horizontal area", "a large square area", "a large vertical area", "a large horizontal area"]
398
+ assert distance_to_viewer > 0
399
+ pass
400
+ ```"""
401
+ self.template = template
402
+
403
+ @staticmethod
404
+ def from_model_manager(model_manager: ModelManager):
405
+ model, model_path = model_manager.fetch_model(
406
+ "omost_prompt", require_model_path=True
407
+ )
408
+ tokenizer = AutoTokenizer.from_pretrained(model_path)
409
+ omost = OmostPromter(
410
+ model=model, tokenizer=tokenizer, device=model_manager.device
411
+ )
412
+ return omost
413
+
414
+ def __call__(self, prompt_dict: dict):
415
+ raw_prompt = prompt_dict["prompt"]
416
+ conversation = [{"role": "system", "content": self.template}]
417
+ conversation.append({"role": "user", "content": raw_prompt})
418
+
419
+ input_ids = self.tokenizer.apply_chat_template(
420
+ conversation, return_tensors="pt", add_generation_prompt=True
421
+ ).to(self.device)
422
+ streamer = TextIteratorStreamer(
423
+ self.tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True
424
+ )
425
+ attention_mask = torch.ones(
426
+ input_ids.shape, dtype=torch.bfloat16, device=self.device
427
+ )
428
+
429
+ generate_kwargs = dict(
430
+ input_ids=input_ids,
431
+ streamer=streamer,
432
+ # stopping_criteria=stopping_criteria,
433
+ # max_new_tokens=max_new_tokens,
434
+ do_sample=True,
435
+ attention_mask=attention_mask,
436
+ pad_token_id=self.tokenizer.eos_token_id,
437
+ # temperature=temperature,
438
+ # top_p=top_p,
439
+ )
440
+ self.model.generate(**generate_kwargs)
441
+ outputs = []
442
+ for text in streamer:
443
+ outputs.append(text)
444
+ llm_outputs = "".join(outputs)
445
+
446
+ canvas = Canvas.from_bot_response(llm_outputs)
447
+ canvas_output = canvas.process()
448
+
449
+ prompts = [
450
+ " ".join(_["prefixes"] + _["suffixes"][:2])
451
+ for _ in canvas_output["bag_of_conditions"]
452
+ ]
453
+ canvas_output["prompt"] = prompts[0]
454
+ canvas_output["prompts"] = prompts[1:]
455
+
456
+ raw_masks = [_["mask"] for _ in canvas_output["bag_of_conditions"]]
457
+ masks = []
458
+ for mask in raw_masks:
459
+ mask[mask > 0.5] = 255
460
+ mask = np.stack([mask] * 3, axis=-1).astype("uint8")
461
+ masks.append(Image.fromarray(mask))
462
+
463
+ canvas_output["masks"] = masks
464
+ prompt_dict.update(canvas_output)
465
+ print(f"Your prompt is extended by Omost:\n")
466
+ cnt = 0
467
+ for component, pmt in zip(canvas_output["bag_of_conditions"], prompts):
468
+ loc = component["location"]
469
+ cnt += 1
470
+ print(f"Component {cnt} - Location : {loc}\nPrompt:{pmt}\n")
471
+
472
+ return prompt_dict
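The `Canvas` class above is the contract the Omost prompter expects the LLM response to fill in. A minimal hand-built sketch (illustrative only, run from the repo root; the scene text and color names are made up and not part of this commit) of how a canvas turns into the 90x90 color layout and per-region masks consumed downstream:

```python
# Hand-built example of the Canvas API from prompters/omost.py (illustrative only).
from prompters.omost import Canvas

canvas = Canvas()
canvas.set_global_description(
    description="A quiet harbor at sunset",
    detailed_descriptions=["Small boats rest on calm water.", "Warm orange light fills the sky."],
    tags="harbor, sunset, boats",
    HTML_web_color_name="navy",
)
canvas.add_local_description(
    location="on the left",
    offset="no offset",
    area="a medium-sized vertical area",
    distance_to_viewer=3.0,
    description="A wooden sailboat",
    detailed_descriptions=["Weathered planks and a folded white sail."],
    tags="sailboat, wood",
    atmosphere="peaceful",
    style="photorealistic",
    quality_meta="high detail",
    HTML_web_color_name="saddlebrown",
)

result = canvas.process()
# result["initial_latent"]: 90x90x3 uint8 color layout (global color blended with region colors)
# result["bag_of_conditions"]: one full-frame entry plus one masked entry per local region
```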
prompters/prompt_refiners.py ADDED
@@ -0,0 +1,131 @@
1
+ from transformers import AutoTokenizer
2
+ from models.model_manager import ModelManager
3
+ import torch
4
+ from .omost import OmostPromter
5
+
6
+
7
+ class BeautifulPrompt(torch.nn.Module):
8
+ def __init__(self, tokenizer_path=None, model=None, template=""):
9
+ super().__init__()
10
+ self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
11
+ self.model = model
12
+ self.template = template
13
+
14
+ @staticmethod
15
+ def from_model_manager(model_manager: ModelManager):
16
+ model, model_path = model_manager.fetch_model(
17
+ "beautiful_prompt", require_model_path=True
18
+ )
19
+ template = "Instruction: Give a simple description of the image to generate a drawing prompt.\nInput: {raw_prompt}\nOutput:"
20
+ if model_path.endswith("v2"):
21
+ template = """Converts a simple image description into a prompt. \
22
+ Prompts are formatted as multiple related tags separated by commas, plus you can use () to increase the weight, [] to decrease the weight, \
23
+ or use a number to specify the weight. You should add appropriate words to make the images described in the prompt more aesthetically pleasing, \
24
+ but make sure there is a correlation between the input and output.\n\
25
+ ### Input: {raw_prompt}\n### Output:"""
26
+ beautiful_prompt = BeautifulPrompt(
27
+ tokenizer_path=model_path, model=model, template=template
28
+ )
29
+ return beautiful_prompt
30
+
31
+ def __call__(self, raw_prompt, positive=True, **kwargs):
32
+ if positive:
33
+ model_input = self.template.format(raw_prompt=raw_prompt)
34
+ input_ids = self.tokenizer.encode(model_input, return_tensors="pt").to(
35
+ self.model.device
36
+ )
37
+ outputs = self.model.generate(
38
+ input_ids,
39
+ max_new_tokens=384,
40
+ do_sample=True,
41
+ temperature=0.9,
42
+ top_k=50,
43
+ top_p=0.95,
44
+ repetition_penalty=1.1,
45
+ num_return_sequences=1,
46
+ )
47
+ prompt = (
48
+ raw_prompt
49
+ + ", "
50
+ + self.tokenizer.batch_decode(
51
+ outputs[:, input_ids.size(1) :], skip_special_tokens=True
52
+ )[0].strip()
53
+ )
54
+ print(f"Your prompt is refined by BeautifulPrompt: {prompt}")
55
+ return prompt
56
+ else:
57
+ return raw_prompt
58
+
59
+
60
+ class QwenPrompt(torch.nn.Module):
61
+ # This class leverages the open-source Qwen model to translate Chinese prompts into English,
62
+ # with an integrated optimization mechanism for enhanced translation quality.
63
+ def __init__(self, tokenizer_path=None, model=None, system_prompt=""):
64
+ super().__init__()
65
+ self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
66
+ self.model = model
67
+ self.system_prompt = system_prompt
68
+
69
+ @staticmethod
70
+ def from_model_manager(model_manager: ModelManager):
71
+ model, model_path = model_manager.fetch_model(
72
+ "qwen_prompt", require_model_path=True
73
+ )
74
+ system_prompt = """You are an English image describer. Here are some example image styles:\n\n1. Extreme close-up: Clear focus on a single object with a blurred background, highlighted under natural sunlight.\n2. Vintage: A photograph of a historical scene, using techniques such as Daguerreotype or cyanotype.\n3. Anime: A stylized cartoon image, emphasizing hyper-realistic portraits and luminous brushwork.\n4. Candid: A natural, unposed shot capturing spontaneous moments, often with cinematic qualities.\n5. Landscape: A photorealistic image of natural scenery, such as a sunrise over the sea.\n6. Design: Colorful and detailed illustrations, often in the style of 2D game art or botanical illustrations.\n7. Urban: An ultrarealistic scene in a modern setting, possibly a cityscape viewed from indoors.\n\nYour task is to translate a given Chinese image description into a concise and precise English description. Ensure that the imagery is vivid and descriptive, and include stylistic elements to enrich the description.\nPlease note the following points:\n\n1. Capture the essence and mood of the Chinese description without including direct phrases or words from the examples provided.\n2. You should add appropriate words to make the images described in the prompt more aesthetically pleasing. If the Chinese description does not specify a style, you need to add some stylistic descriptions based on the essence of the Chinese text.\n3. The generated English description should not exceed 200 words.\n\n"""
75
+ qwen_prompt = QwenPrompt(
76
+ tokenizer_path=model_path, model=model, system_prompt=system_prompt
77
+ )
78
+ return qwen_prompt
79
+
80
+ def __call__(self, raw_prompt, positive=True, **kwargs):
81
+ if positive:
82
+ messages = [
83
+ {"role": "system", "content": self.system_prompt},
84
+ {"role": "user", "content": raw_prompt},
85
+ ]
86
+ text = self.tokenizer.apply_chat_template(
87
+ messages, tokenize=False, add_generation_prompt=True
88
+ )
89
+ model_inputs = self.tokenizer([text], return_tensors="pt").to(
90
+ self.model.device
91
+ )
92
+
93
+ generated_ids = self.model.generate(
94
+ model_inputs.input_ids, max_new_tokens=512
95
+ )
96
+ generated_ids = [
97
+ output_ids[len(input_ids) :]
98
+ for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
99
+ ]
100
+
101
+ prompt = self.tokenizer.batch_decode(
102
+ generated_ids, skip_special_tokens=True
103
+ )[0]
104
+ print(f"Your prompt is refined by Qwen: {prompt}")
105
+ return prompt
106
+ else:
107
+ return raw_prompt
108
+
109
+
110
+ class Translator(torch.nn.Module):
111
+ def __init__(self, tokenizer_path=None, model=None):
112
+ super().__init__()
113
+ self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
114
+ self.model = model
115
+
116
+ @staticmethod
117
+ def from_model_manager(model_manager: ModelManager):
118
+ model, model_path = model_manager.fetch_model(
119
+ "translator", require_model_path=True
120
+ )
121
+ translator = Translator(tokenizer_path=model_path, model=model)
122
+ return translator
123
+
124
+ def __call__(self, prompt, **kwargs):
125
+ input_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(
126
+ self.model.device
127
+ )
128
+ output_ids = self.model.generate(input_ids)
129
+ prompt = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]
130
+ print(f"Your prompt is translated: {prompt}")
131
+ return prompt
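All three refiners above share one call convention: raw prompt in, refined prompt out, and `positive=False` passes the prompt through untouched. A rough sketch of driving `QwenPrompt` directly from a Hugging Face checkpoint (the checkpoint name is a placeholder; in this repo the refiner is normally built via `ModelManager.fetch_model`, as in `from_model_manager` above):

```python
# Illustrative only: builds QwenPrompt without ModelManager; the checkpoint name is a placeholder.
from transformers import AutoModelForCausalLM
from prompters.prompt_refiners import QwenPrompt

checkpoint = "Qwen/Qwen2.5-3B-Instruct"  # placeholder checkpoint
model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype="auto").to("cuda")

refiner = QwenPrompt(
    tokenizer_path=checkpoint,
    model=model,
    system_prompt="You are an English image describer. Translate the Chinese prompt into a vivid English description.",
)

english_prompt = refiner("一只在月光下奔跑的白色小猫", positive=True)
# positive=False returns the prompt unchanged, so negative prompts are never rewritten.
```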
prompters/wan_prompter.py ADDED
@@ -0,0 +1,112 @@
1
+ from .base_prompter import BasePrompter
2
+ from models.wan_video_text_encoder import WanTextEncoder
3
+ from transformers import AutoTokenizer
4
+ import os, torch
5
+ import ftfy
6
+ import html
7
+ import string
8
+ import regex as re
9
+
10
+
11
+ def basic_clean(text):
12
+ text = ftfy.fix_text(text)
13
+ text = html.unescape(html.unescape(text))
14
+ return text.strip()
15
+
16
+
17
+ def whitespace_clean(text):
18
+ text = re.sub(r"\s+", " ", text)
19
+ text = text.strip()
20
+ return text
21
+
22
+
23
+ def canonicalize(text, keep_punctuation_exact_string=None):
24
+ text = text.replace("_", " ")
25
+ if keep_punctuation_exact_string:
26
+ text = keep_punctuation_exact_string.join(
27
+ part.translate(str.maketrans("", "", string.punctuation))
28
+ for part in text.split(keep_punctuation_exact_string)
29
+ )
30
+ else:
31
+ text = text.translate(str.maketrans("", "", string.punctuation))
32
+ text = text.lower()
33
+ text = re.sub(r"\s+", " ", text)
34
+ return text.strip()
35
+
36
+
37
+ class HuggingfaceTokenizer:
38
+ def __init__(self, name, seq_len=None, clean=None, **kwargs):
39
+ assert clean in (None, "whitespace", "lower", "canonicalize")
40
+ self.name = name
41
+ self.seq_len = seq_len
42
+ self.clean = clean
43
+
44
+ # init tokenizer
45
+ self.tokenizer = AutoTokenizer.from_pretrained(name, **kwargs)
46
+ self.vocab_size = self.tokenizer.vocab_size
47
+
48
+ def __call__(self, sequence, **kwargs):
49
+ return_mask = kwargs.pop("return_mask", False)
50
+
51
+ # arguments
52
+ _kwargs = {"return_tensors": "pt"}
53
+ if self.seq_len is not None:
54
+ _kwargs.update(
55
+ {
56
+ "padding": "max_length",
57
+ "truncation": True,
58
+ "max_length": self.seq_len,
59
+ }
60
+ )
61
+ _kwargs.update(**kwargs)
62
+
63
+ # tokenization
64
+ if isinstance(sequence, str):
65
+ sequence = [sequence]
66
+ if self.clean:
67
+ sequence = [self._clean(u) for u in sequence]
68
+ ids = self.tokenizer(sequence, **_kwargs)
69
+
70
+ # output
71
+ if return_mask:
72
+ return ids.input_ids, ids.attention_mask
73
+ else:
74
+ return ids.input_ids
75
+
76
+ def _clean(self, text):
77
+ if self.clean == "whitespace":
78
+ text = whitespace_clean(basic_clean(text))
79
+ elif self.clean == "lower":
80
+ text = whitespace_clean(basic_clean(text)).lower()
81
+ elif self.clean == "canonicalize":
82
+ text = canonicalize(basic_clean(text))
83
+ return text
84
+
85
+
86
+ class WanPrompter(BasePrompter):
87
+ def __init__(self, tokenizer_path=None, text_len=512):
88
+ super().__init__()
89
+ self.text_len = text_len
90
+ self.text_encoder = None
91
+ self.fetch_tokenizer(tokenizer_path)
92
+
93
+ def fetch_tokenizer(self, tokenizer_path=None):
94
+ if tokenizer_path is not None:
95
+ self.tokenizer = HuggingfaceTokenizer(
96
+ name=tokenizer_path, seq_len=self.text_len, clean="whitespace"
97
+ )
98
+
99
+ def fetch_models(self, text_encoder: WanTextEncoder = None):
100
+ self.text_encoder = text_encoder
101
+
102
+ def encode_prompt(self, prompt, positive=True, device="cuda"):
103
+ prompt = self.process_prompt(prompt, positive=positive)
104
+
105
+ ids, mask = self.tokenizer(prompt, return_mask=True, add_special_tokens=True)
106
+ ids = ids.to(device)
107
+ mask = mask.to(device)
108
+ seq_lens = mask.gt(0).sum(dim=1).long()
109
+ prompt_emb = self.text_encoder(ids, mask)
110
+ for i, v in enumerate(seq_lens):
111
+ prompt_emb[:, v:] = 0
112
+ return prompt_emb
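`WanPrompter` only cleans, tokenizes, and forwards to the T5-style `WanTextEncoder`, so the `HuggingfaceTokenizer` wrapper can be exercised on its own. A small sketch using a public tokenizer as a stand-in for the Wan tokenizer files (`t5-small` is purely illustrative):

```python
# Illustrative: exercise the HuggingfaceTokenizer wrapper with a public stand-in tokenizer.
from prompters.wan_prompter import HuggingfaceTokenizer

tokenizer = HuggingfaceTokenizer(name="t5-small", seq_len=512, clean="whitespace")
ids, mask = tokenizer("A  corgi   running on   the beach", return_mask=True)

print(ids.shape, mask.shape)  # both (1, 512): padded / truncated to seq_len
print(int(mask.sum()))        # number of real (non-padding) tokens
```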
requirements.txt ADDED
@@ -0,0 +1,17 @@
1
+ torch==2.7.0
2
+ torchvision==0.22.0
3
+ ftfy==6.3.1
4
+ huggingface_hub==0.31.1
5
+ imageio==2.37.0
6
+ insightface==0.7.3
7
+ numpy==2.2.6
8
+ opencv_python==4.11.0.86
9
+ Pillow==11.3.0
10
+ safetensors==0.5.3
11
+ tqdm==4.67.1
12
+ transformers==4.46.2
13
+ facexlib==0.3.0
14
+ einops==0.8.1
15
+ onnxruntime-gpu==1.22.0
16
+ imageio-ffmpeg==0.6.0
17
+ scikit-image==0.25.2
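The pins are exact (`==`), so the usual failure mode is silent version drift in an existing environment. A tiny, purely illustrative sanity check for the two heaviest pins:

```python
# Illustrative environment check against the pinned versions above.
import torch, torchvision

print(torch.__version__)         # expected to start with 2.7.0
print(torchvision.__version__)   # expected to start with 0.22.0
print(torch.cuda.is_available()) # onnxruntime-gpu in the pins assumes a CUDA device
```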
schedulers/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from .ddim import EnhancedDDIMScheduler
2
+ from .continuous_ode import ContinuousODEScheduler
3
+ from .flow_match import FlowMatchScheduler
schedulers/continuous_ode.py ADDED
@@ -0,0 +1,61 @@
1
+ import torch
2
+
3
+
4
+ class ContinuousODEScheduler:
5
+ def __init__(
6
+ self, num_inference_steps=100, sigma_max=700.0, sigma_min=0.002, rho=7.0
7
+ ):
8
+ self.sigma_max = sigma_max
9
+ self.sigma_min = sigma_min
10
+ self.rho = rho
11
+ self.set_timesteps(num_inference_steps)
12
+
13
+ def set_timesteps(self, num_inference_steps=100, denoising_strength=1.0, **kwargs):
14
+ ramp = torch.linspace(1 - denoising_strength, 1, num_inference_steps)
15
+ min_inv_rho = torch.pow(torch.tensor((self.sigma_min,)), (1 / self.rho))
16
+ max_inv_rho = torch.pow(torch.tensor((self.sigma_max,)), (1 / self.rho))
17
+ self.sigmas = torch.pow(
18
+ max_inv_rho + ramp * (min_inv_rho - max_inv_rho), self.rho
19
+ )
20
+ self.timesteps = torch.log(self.sigmas) * 0.25
21
+
22
+ def step(self, model_output, timestep, sample, to_final=False):
23
+ timestep_id = torch.argmin((self.timesteps - timestep).abs())
24
+ sigma = self.sigmas[timestep_id]
25
+ sample *= (sigma * sigma + 1).sqrt()
26
+ estimated_sample = (
27
+ -sigma / (sigma * sigma + 1).sqrt() * model_output
28
+ + 1 / (sigma * sigma + 1) * sample
29
+ )
30
+ if to_final or timestep_id + 1 >= len(self.timesteps):
31
+ prev_sample = estimated_sample
32
+ else:
33
+ sigma_ = self.sigmas[timestep_id + 1]
34
+ derivative = 1 / sigma * (sample - estimated_sample)
35
+ prev_sample = sample + derivative * (sigma_ - sigma)
36
+ prev_sample /= (sigma_ * sigma_ + 1).sqrt()
37
+ return prev_sample
38
+
39
+ def return_to_timestep(self, timestep, sample, sample_stablized):
40
+ # This scheduler doesn't support this function.
41
+ pass
42
+
43
+ def add_noise(self, original_samples, noise, timestep):
44
+ timestep_id = torch.argmin((self.timesteps - timestep).abs())
45
+ sigma = self.sigmas[timestep_id]
46
+ sample = (original_samples + noise * sigma) / (sigma * sigma + 1).sqrt()
47
+ return sample
48
+
49
+ def training_target(self, sample, noise, timestep):
50
+ timestep_id = torch.argmin((self.timesteps - timestep).abs())
51
+ sigma = self.sigmas[timestep_id]
52
+ target = (
53
+ -(sigma * sigma + 1).sqrt() / sigma + 1 / (sigma * sigma + 1).sqrt() / sigma
54
+ ) * sample + 1 / (sigma * sigma + 1).sqrt() * noise
55
+ return target
56
+
57
+ def training_weight(self, timestep):
58
+ timestep_id = torch.argmin((self.timesteps - timestep).abs())
59
+ sigma = self.sigmas[timestep_id]
60
+ weight = (1 + sigma * sigma).sqrt() / sigma
61
+ return weight
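The sigmas above follow a rho-spaced (Karras-style) ramp from `sigma_max` down to `sigma_min`, and the scheduler keeps samples divided by `sqrt(sigma^2 + 1)` (see `add_noise` and the rescaling inside `step`). A bare sampling-loop skeleton, with the denoiser call left as a placeholder and shapes chosen for illustration:

```python
# Illustrative sampling-loop skeleton for ContinuousODEScheduler; the denoiser is a placeholder.
import torch
from schedulers.continuous_ode import ContinuousODEScheduler

scheduler = ContinuousODEScheduler(num_inference_steps=30)

latent_shape = (1, 4, 16, 16)
noise = torch.randn(latent_shape)
# Start from pure noise, expressed in the scheduler's own scaling convention.
sample = scheduler.add_noise(torch.zeros(latent_shape), noise, scheduler.timesteps[0])

for timestep in scheduler.timesteps:
    model_output = torch.zeros_like(sample)  # placeholder for denoiser(sample, timestep)
    sample = scheduler.step(model_output, timestep, sample)
```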
schedulers/ddim.py ADDED
@@ -0,0 +1,136 @@
1
+ import torch, math
2
+
3
+
4
+ class EnhancedDDIMScheduler:
5
+ def __init__(
6
+ self,
7
+ num_train_timesteps=1000,
8
+ beta_start=0.00085,
9
+ beta_end=0.012,
10
+ beta_schedule="scaled_linear",
11
+ prediction_type="epsilon",
12
+ rescale_zero_terminal_snr=False,
13
+ ):
14
+ self.num_train_timesteps = num_train_timesteps
15
+ if beta_schedule == "scaled_linear":
16
+ betas = torch.square(
17
+ torch.linspace(
18
+ math.sqrt(beta_start),
19
+ math.sqrt(beta_end),
20
+ num_train_timesteps,
21
+ dtype=torch.float32,
22
+ )
23
+ )
24
+ elif beta_schedule == "linear":
25
+ betas = torch.linspace(
26
+ beta_start, beta_end, num_train_timesteps, dtype=torch.float32
27
+ )
28
+ else:
29
+ raise NotImplementedError(f"{beta_schedule} is not implemented")
30
+ self.alphas_cumprod = torch.cumprod(1.0 - betas, dim=0)
31
+ if rescale_zero_terminal_snr:
32
+ self.alphas_cumprod = self.rescale_zero_terminal_snr(self.alphas_cumprod)
33
+ self.alphas_cumprod = self.alphas_cumprod.tolist()
34
+ self.set_timesteps(10)
35
+ self.prediction_type = prediction_type
36
+
37
+ def rescale_zero_terminal_snr(self, alphas_cumprod):
38
+ alphas_bar_sqrt = alphas_cumprod.sqrt()
39
+
40
+ # Store old values.
41
+ alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone()
42
+ alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone()
43
+
44
+ # Shift so the last timestep is zero.
45
+ alphas_bar_sqrt -= alphas_bar_sqrt_T
46
+
47
+ # Scale so the first timestep is back to the old value.
48
+ alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T)
49
+
50
+ # Convert alphas_bar_sqrt to betas
51
+ alphas_bar = alphas_bar_sqrt.square() # Revert sqrt
52
+
53
+ return alphas_bar
54
+
55
+ def set_timesteps(self, num_inference_steps, denoising_strength=1.0, **kwargs):
56
+ # The timesteps are aligned to 999...0, which is different from other implementations,
57
+ # but I think this implementation is more reasonable in theory.
58
+ max_timestep = max(round(self.num_train_timesteps * denoising_strength) - 1, 0)
59
+ num_inference_steps = min(num_inference_steps, max_timestep + 1)
60
+ if num_inference_steps == 1:
61
+ self.timesteps = torch.Tensor([max_timestep])
62
+ else:
63
+ step_length = max_timestep / (num_inference_steps - 1)
64
+ self.timesteps = torch.Tensor(
65
+ [
66
+ round(max_timestep - i * step_length)
67
+ for i in range(num_inference_steps)
68
+ ]
69
+ )
70
+
71
+ def denoise(self, model_output, sample, alpha_prod_t, alpha_prod_t_prev):
72
+ if self.prediction_type == "epsilon":
73
+ weight_e = math.sqrt(1 - alpha_prod_t_prev) - math.sqrt(
74
+ alpha_prod_t_prev * (1 - alpha_prod_t) / alpha_prod_t
75
+ )
76
+ weight_x = math.sqrt(alpha_prod_t_prev / alpha_prod_t)
77
+ prev_sample = sample * weight_x + model_output * weight_e
78
+ elif self.prediction_type == "v_prediction":
79
+ weight_e = -math.sqrt(alpha_prod_t_prev * (1 - alpha_prod_t)) + math.sqrt(
80
+ alpha_prod_t * (1 - alpha_prod_t_prev)
81
+ )
82
+ weight_x = math.sqrt(alpha_prod_t * alpha_prod_t_prev) + math.sqrt(
83
+ (1 - alpha_prod_t) * (1 - alpha_prod_t_prev)
84
+ )
85
+ prev_sample = sample * weight_x + model_output * weight_e
86
+ else:
87
+ raise NotImplementedError(f"{self.prediction_type} is not implemented")
88
+ return prev_sample
89
+
90
+ def step(self, model_output, timestep, sample, to_final=False):
91
+ alpha_prod_t = self.alphas_cumprod[int(timestep.flatten().tolist()[0])]
92
+ if isinstance(timestep, torch.Tensor):
93
+ timestep = timestep.cpu()
94
+ timestep_id = torch.argmin((self.timesteps - timestep).abs())
95
+ if to_final or timestep_id + 1 >= len(self.timesteps):
96
+ alpha_prod_t_prev = 1.0
97
+ else:
98
+ timestep_prev = int(self.timesteps[timestep_id + 1])
99
+ alpha_prod_t_prev = self.alphas_cumprod[timestep_prev]
100
+
101
+ return self.denoise(model_output, sample, alpha_prod_t, alpha_prod_t_prev)
102
+
103
+ def return_to_timestep(self, timestep, sample, sample_stablized):
104
+ alpha_prod_t = self.alphas_cumprod[int(timestep.flatten().tolist()[0])]
105
+ noise_pred = (sample - math.sqrt(alpha_prod_t) * sample_stablized) / math.sqrt(
106
+ 1 - alpha_prod_t
107
+ )
108
+ return noise_pred
109
+
110
+ def add_noise(self, original_samples, noise, timestep):
111
+ sqrt_alpha_prod = math.sqrt(
112
+ self.alphas_cumprod[int(timestep.flatten().tolist()[0])]
113
+ )
114
+ sqrt_one_minus_alpha_prod = math.sqrt(
115
+ 1 - self.alphas_cumprod[int(timestep.flatten().tolist()[0])]
116
+ )
117
+ noisy_samples = (
118
+ sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
119
+ )
120
+ return noisy_samples
121
+
122
+ def training_target(self, sample, noise, timestep):
123
+ if self.prediction_type == "epsilon":
124
+ return noise
125
+ else:
126
+ sqrt_alpha_prod = math.sqrt(
127
+ self.alphas_cumprod[int(timestep.flatten().tolist()[0])]
128
+ )
129
+ sqrt_one_minus_alpha_prod = math.sqrt(
130
+ 1 - self.alphas_cumprod[int(timestep.flatten().tolist()[0])]
131
+ )
132
+ target = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample
133
+ return target
134
+
135
+ def training_weight(self, timestep):
136
+ return 1.0
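`denoise` above is the deterministic DDIM update for both epsilon- and v-prediction. A quick round-trip sketch (shapes illustrative): adding noise at t=999 and stepping once with the true noise as the model output should land back on the clean sample:

```python
# Illustrative round trip for EnhancedDDIMScheduler with epsilon prediction.
import torch
from schedulers.ddim import EnhancedDDIMScheduler

scheduler = EnhancedDDIMScheduler(prediction_type="epsilon")
scheduler.set_timesteps(num_inference_steps=1)  # single step: t=999 -> clean

clean = torch.randn(1, 4, 8, 8)
noise = torch.randn_like(clean)
t = scheduler.timesteps[0]

noisy = scheduler.add_noise(clean, noise, t)
recovered = scheduler.step(noise, t, noisy)  # feed the true noise as the "model output"

print(torch.allclose(recovered, clean, atol=1e-4))  # True up to float32 rounding
```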
schedulers/flow_match.py ADDED
@@ -0,0 +1,100 @@
1
+ import torch
2
+
3
+
4
+ class FlowMatchScheduler:
5
+ def __init__(
6
+ self,
7
+ num_inference_steps=100,
8
+ num_train_timesteps=1000,
9
+ shift=3.0,
10
+ sigma_max=1.0,
11
+ sigma_min=0.003 / 1.002,
12
+ inverse_timesteps=False,
13
+ extra_one_step=False,
14
+ reverse_sigmas=False,
15
+ ):
16
+ self.num_train_timesteps = num_train_timesteps
17
+ self.shift = shift
18
+ self.sigma_max = sigma_max
19
+ self.sigma_min = sigma_min
20
+ self.inverse_timesteps = inverse_timesteps
21
+ self.extra_one_step = extra_one_step
22
+ self.reverse_sigmas = reverse_sigmas
23
+ self.set_timesteps(num_inference_steps)
24
+
25
+ def set_timesteps(
26
+ self,
27
+ num_inference_steps=100,
28
+ denoising_strength=1.0,
29
+ training=False,
30
+ shift=None,
31
+ ):
32
+ if shift is not None:
33
+ self.shift = shift
34
+ sigma_start = (
35
+ self.sigma_min + (self.sigma_max - self.sigma_min) * denoising_strength
36
+ )
37
+ if self.extra_one_step:
38
+ self.sigmas = torch.linspace(
39
+ sigma_start, self.sigma_min, num_inference_steps + 1
40
+ )[:-1]
41
+ else:
42
+ self.sigmas = torch.linspace(
43
+ sigma_start, self.sigma_min, num_inference_steps
44
+ )
45
+ if self.inverse_timesteps:
46
+ self.sigmas = torch.flip(self.sigmas, dims=[0])
47
+ self.sigmas = self.shift * self.sigmas / (1 + (self.shift - 1) * self.sigmas)
48
+ if self.reverse_sigmas:
49
+ self.sigmas = 1 - self.sigmas
50
+ self.timesteps = self.sigmas * self.num_train_timesteps
51
+ if training:
52
+ x = self.timesteps
53
+ y = torch.exp(
54
+ -2 * ((x - num_inference_steps / 2) / num_inference_steps) ** 2
55
+ )
56
+ y_shifted = y - y.min()
57
+ bsmntw_weighing = y_shifted * (num_inference_steps / y_shifted.sum())
58
+ self.linear_timesteps_weights = bsmntw_weighing
59
+ self.training = True
60
+ else:
61
+ self.training = False
62
+
63
+ def step(self, model_output, timestep, sample, to_final=False, **kwargs):
64
+ if isinstance(timestep, torch.Tensor):
65
+ timestep = timestep.cpu()
66
+ timestep_id = torch.argmin((self.timesteps - timestep).abs())
67
+ sigma = self.sigmas[timestep_id]
68
+ if to_final or timestep_id + 1 >= len(self.timesteps):
69
+ sigma_ = 1 if (self.inverse_timesteps or self.reverse_sigmas) else 0
70
+ else:
71
+ sigma_ = self.sigmas[timestep_id + 1]
72
+ prev_sample = sample + model_output * (sigma_ - sigma)
73
+ return prev_sample
74
+
75
+ def return_to_timestep(self, timestep, sample, sample_stablized):
76
+ if isinstance(timestep, torch.Tensor):
77
+ timestep = timestep.cpu()
78
+ timestep_id = torch.argmin((self.timesteps - timestep).abs())
79
+ sigma = self.sigmas[timestep_id]
80
+ model_output = (sample - sample_stablized) / sigma
81
+ return model_output
82
+
83
+ def add_noise(self, original_samples, noise, timestep):
84
+ if isinstance(timestep, torch.Tensor):
85
+ timestep = timestep.cpu()
86
+ timestep_id = torch.argmin((self.timesteps - timestep).abs())
87
+ sigma = self.sigmas[timestep_id]
88
+ sample = (1 - sigma) * original_samples + sigma * noise
89
+ return sample
90
+
91
+ def training_target(self, sample, noise, timestep):
92
+ target = noise - sample
93
+ return target
94
+
95
+ def training_weight(self, timestep):
96
+ timestep_id = torch.argmin(
97
+ (self.timesteps - timestep.to(self.timesteps.device)).abs()
98
+ )
99
+ weights = self.linear_timesteps_weights[timestep_id]
100
+ return weights
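`FlowMatchScheduler` mixes noise linearly, `x_sigma = (1 - sigma) * x0 + sigma * noise`, and `step` moves along the predicted velocity from one sigma to the next. A single-step sketch (shapes illustrative) showing that the training target `noise - sample` carries the state exactly onto the next sigma's mixture:

```python
# Illustrative single-step check of the flow-matching update.
import torch
from schedulers.flow_match import FlowMatchScheduler

scheduler = FlowMatchScheduler(num_inference_steps=10, shift=3.0)

clean = torch.randn(1, 4, 8, 8)
noise = torch.randn_like(clean)
t0, t1 = scheduler.timesteps[0], scheduler.timesteps[1]

x_t0 = scheduler.add_noise(clean, noise, t0)
velocity = scheduler.training_target(clean, noise, t0)  # = noise - clean
x_t1 = scheduler.step(velocity, t0, x_t0)

print(torch.allclose(x_t1, scheduler.add_noise(clean, noise, t1), atol=1e-5))  # True
```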
test/input/first_frame.png ADDED

Git LFS Details

  • SHA256: 1f864b3330b1b47f11cc71235993766a34187a107b01815bf2708f4a459ffa67
  • Pointer size: 131 Bytes
  • Size of remote file: 403 kB
test/input/lecun.jpg ADDED
test/input/pose.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f2038896160fb990162832cbff6eaebcf05b25e1a3b8c201e5b147a4ce3ce01d
3
+ size 173260
test/input/ruonan.jpg ADDED

Git LFS Details

  • SHA256: d0f82d2b7c91c08033ca2ce14d475675ccd302a966a6abad4f028568a5d078d2
  • Pointer size: 131 Bytes
  • Size of remote file: 204 kB
test/input/woman.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee8fb303f53a89c0ab36c0457c9452149c58a881055becb4da8abc41766bc6db
3
+ size 8399484