Test

Paused

App Files Files Community

caarleexx commited on Oct 29

Commit

bbf5f69

verified ·

1 Parent(s): 007c224

Upload 16 files

Browse files

Files changed (8) hide show

Dockerfile +76 -75
app.py +148 -261
compose.yaml +31 -18
entrypoint.sh +36 -13
pipeline_ltx_video (3).py +1895 -0
requirements.txt +1 -1
setup.py +114 -120
start.sh +73 -34

Dockerfile CHANGED Viewed

@@ -1,126 +1,127 @@
 # =============================================================================
-# ADUC-SDR Video Suite — High-Perf Diffusers for 8× L40S (SM 8.9)
 # CUDA 12.8 | PyTorch 2.8.0+cu128 | Ubuntu 22.04
 # =============================================================================
 FROM nvidia/cuda:12.8.0-devel-ubuntu22.04
-LABEL maintainer="Carlos Rodrigues dos Santos & Development Partner"
-LABEL description="High-performance Diffusers stack with FA2/SDPA, 8×L40S"
-LABEL version="4.4.0"
 LABEL cuda_version="12.8.0"
 LABEL python_version="3.10"
 LABEL pytorch_version="2.8.0+cu128"
 LABEL gpu_optimized_for="8x_NVIDIA_L40S"
-# ---------------- Core env & caches ----------------
 ENV DEBIAN_FRONTEND=noninteractive TZ=UTC LANG=C.UTF-8 LC_ALL=C.UTF-8 \
     PYTHONUNBUFFERED=1 PYTHONDONTWRITEBYTECODE=1 \
-    PIP_NO_CACHE_DIR=1 PIP_DISABLE_PIP_VERSION_CHECK=1
-# GPU/Compute
-ENV NVIDIA_VISIBLE_DEVICES=all
-ENV TORCH_CUDA_ARCH_LIST="8.9"
-ENV CUDA_DEVICE_ORDER=PCI_BUS_ID
-ENV CUDA_DEVICE_MAX_CONNECTIONS=32
-# Threads
 ENV OMP_NUM_THREADS=8 MKL_NUM_THREADS=8 MAX_JOBS=160
-# Alloc/caches
-ENV PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512,garbage_collection_threshold:0.8
-ENV CUDA_LAUNCH_BLOCKING=0 CUDA_CACHE_MAXSIZE=2147483648 CUDA_CACHE_DISABLE=0
-# App paths
-ENV APP_HOME=/app
-WORKDIR $APP_HOME
-# Persistent data and caches in /data
-ENV HF_HOME=/data/.cache/huggingface
-ENV TORCH_HOME=/data/.cache/torch
-ENV HF_DATASETS_CACHE=/data/.cache/datasets
-ENV TRANSFORMERS_CACHE=/data/.cache/transformers
-ENV DIFFUSERS_CACHE=/data/.cache/diffusers
-ENV HF_HUB_ENABLE_HF_TRANSFER=1
-ENV TOKENIZERS_PARALLELISM=false
-# Create non-root user and data dirs early, fix ownership
 RUN useradd -m -u 1000 -s /bin/bash appuser && \
-    mkdir -p /data /data/models \
-             /data/.cache/huggingface /data/.cache/torch \
-             /data/.cache/datasets /data/.cache/transformers /data/.cache/diffusers && \
-    chown -R appuser:appuser /data
-# Models live in /data/models and are visible at /app/models
-ENV MODELS_DIR=/data/models
-RUN ln -sf /data/models /app/models
-# ---------------- System & Python ----------------
 RUN apt-get update && apt-get install -y --no-install-recommends \
     build-essential gosu tree cmake git git-lfs curl wget ffmpeg ninja-build \
     python3.10 python3.10-dev python3.10-distutils python3-pip \
     ca-certificates libglib2.0-0 libgl1 \
  && apt-get clean && rm -rf /var/lib/apt/lists/*
-RUN ln -sf /usr/bin/python3.10 /usr/bin/python3 && \
-    ln -sf /usr/bin/python3.10 /usr/bin/python && \
     python3 -m pip install --upgrade pip
-# ---------------- PyTorch cu128 (pinned) ----------------
 RUN pip install --index-url https://download.pytorch.org/whl/cu128 \
     torch>=2.8.0+cu128 torchvision>=0.23.0+cu128 torchaudio>=2.8.0+cu128
-# ---------------- Toolchain, Triton, FA2 (no bnb build) ----------------
 RUN pip install packaging ninja cmake pybind11 scikit-build cython hf_transfer "numpy>=1.24.4"
-# Triton 3.x (no triton.ops)
 RUN pip uninstall -y triton || true && \
     pip install -v --no-build-isolation triton==3.4.0
-# FlashAttention 2.8.x
-RUN pip install flash-attn==2.8.3 --no-build-isolation || \
-    pip install flash-attn==2.8.2 --no-build-isolation || \
-    pip install flash-attn==2.8.1 --no-build-isolation || \
-    pip install flash-attn==2.8.0.post2 --no-build-isolation
-# ---------------- App dependencies ----------------
-COPY requirements.txt ./requirements.txt
 RUN pip install --no-cache-dir -r requirements.txt
-# Pin bnb to avoid surprise CUDA/PTX mismatches (adjust as needed)
 RUN pip install --upgrade bitsandbytes
-# Custom .whl (Apex + dropout_layer_norm)
 RUN echo "Installing custom wheels..." && \
     pip install --no-cache-dir \
       "https://huggingface.co/euIaxs22/Aduc-sdr/resolve/main/apex-0.1-cp310-cp310-linux_x86_64.whl" \
       "https://huggingface.co/euIaxs22/Aduc-sdr/resolve/main/dropout_layer_norm-0.1-cp310-cp310-linux_x86_64.whl"
-# ====================================================================
-# Optional: q8_kernels + LTX-Video (enable if needed; ensure wheel ABI)
-RUN pip install --no-cache-dir \
-   "https://huggingface.co/euIaxs22/Aduc-sdr/resolve/main/q8_kernels-0.0.5-cp310-cp310-linux_x86_64.whl"
-# RUN git clone https://github.com/Lightricks/LTX-Video.git /data/LTX-Video && \
-#     cd /data/LTX-Video && python -m pip install -e .[inference]
-# ====================================================================
-# Scripts and app
-COPY info.sh ./app/info.sh
-COPY builder.sh ./app/builder.sh
-COPY start.sh ./app/start.sh
-COPY entrypoint.sh ./app/entrypoint.sh
-# Copy the rest of the source last for better caching
-COPY . .
-# Permissions on app tree
-RUN chown -R appuser:appuser /app /data && \
-    chmod 0755 /app/entrypoint.sh /app/start.sh /app/info.sh /app/builder.sh
 VOLUME /data
-ENTRYPOINT ["/app/entrypoint.sh"]
 USER appuser
-# ---------------- Entry ----------------
-CMD ["/app/start.sh"]

 # =============================================================================
+# ADUC-SDR Video Suite — Dockerfile Otimizado
+# Preserva a estrutura de instalação original para alta performance.
 # CUDA 12.8 | PyTorch 2.8.0+cu128 | Ubuntu 22.04
 # =============================================================================
 FROM nvidia/cuda:12.8.0-devel-ubuntu22.04
+LABEL maintainer="Carlos Rodrigues dos Santos"
+LABEL description="ADUC-SDR: High-performance Diffusers stack for 8x NVIDIA L40S with LTX-Video and SeedVR"
+LABEL version="5.0.0"
 LABEL cuda_version="12.8.0"
 LABEL python_version="3.10"
 LABEL pytorch_version="2.8.0+cu128"
 LABEL gpu_optimized_for="8x_NVIDIA_L40S"
+# =============================================================================
+# 1. Variáveis de Ambiente e Configuração de Paths
+# =============================================================================
 ENV DEBIAN_FRONTEND=noninteractive TZ=UTC LANG=C.UTF-8 LC_ALL=C.UTF-8 \
     PYTHONUNBUFFERED=1 PYTHONDONTWRITEBYTECODE=1 \
+    PIP_NO_CACHE_DIR=0 PIP_DISABLE_PIP_VERSION_CHECK=1
+# --- Configurações de GPU e Computação ---
+ENV NVIDIA_VISIBLE_DEVICES=all \
+    TORCH_CUDA_ARCH_LIST="8.9" \
+    CUDA_DEVICE_ORDER=PCI_BUS_ID \
+    CUDA_DEVICE_MAX_CONNECTIONS=32
+# --- Configurações de Threads ---
 ENV OMP_NUM_THREADS=8 MKL_NUM_THREADS=8 MAX_JOBS=160
+# --- Configurações de Alocador de Memória e Caches de GPU ---
+ENV PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512,garbage_collection_threshold:0.8 \
+    CUDA_LAUNCH_BLOCKING=0 CUDA_CACHE_MAXSIZE=2147483648 CUDA_CACHE_DISABLE=0
+# --- Paths da Aplicação e Dados Persistentes ---
+ENV APP_HOME=/app \
+    HF_HOME=/data/.cache/huggingface \
+    TORCH_HOME=/data/.cache/torch \
+    HF_DATASETS_CACHE=/data/.cache/datasets \
+    TRANSFORMERS_CACHE=/data/.cache/transformers \
+    DIFFUSERS_CACHE=/data/.cache/diffusers \
+    HF_HUB_ENABLE_HF_TRANSFER=1 \
+    TOKENIZERS_PARALLELISM=false
+WORKDIR $APP_HOME
+# =============================================================================
+# 2. Setup de Usuário e Sistema
+# =============================================================================
+# Cria usuário não-root e diretórios de dados/app.
+# As permissões finais serão aplicadas no final.
 RUN useradd -m -u 1000 -s /bin/bash appuser && \
+    mkdir -p /data $APP_HOME /app/output
+# --- Instalação de Pacotes de Sistema e Python ---
 RUN apt-get update && apt-get install -y --no-install-recommends \
     build-essential gosu tree cmake git git-lfs curl wget ffmpeg ninja-build \
     python3.10 python3.10-dev python3.10-distutils python3-pip \
     ca-certificates libglib2.0-0 libgl1 \
  && apt-get clean && rm -rf /var/lib/apt/lists/*
+RUN ln -sf /usr/bin/python3.10 /usr/bin/python && \
     python3 -m pip install --upgrade pip
+# =============================================================================
+# 3. Instalação da Toolchain de Machine Learning (Mantida 100% Original)
+# =============================================================================
+# --- PyTorch para CUDA 12.8 ---
 RUN pip install --index-url https://download.pytorch.org/whl/cu128 \
     torch>=2.8.0+cu128 torchvision>=0.23.0+cu128 torchaudio>=2.8.0+cu128
+# --- Ferramentas de Compilação, Triton e FlashAttention ---
 RUN pip install packaging ninja cmake pybind11 scikit-build cython hf_transfer "numpy>=1.24.4"
+# --- Triton 3.x ---
 RUN pip uninstall -y triton || true && \
     pip install -v --no-build-isolation triton==3.4.0
+# --- FlashAttention 2.8.x ---
+# =============================================================================
+# 4. Instalação das Dependências da Aplicação
+# =============================================================================
+# Copia e instala requirements.txt primeiro para otimizar o cache de camadas do Docker.
+COPY --chown=appuser:appuser requirements.txt ./requirements.txt
 RUN pip install --no-cache-dir -r requirements.txt
+# --- Instalação de bitsandbytes e Wheels Customizados (Mantido 100% Original) ---
 RUN pip install --upgrade bitsandbytes
+# Instala wheels customizados (Apex, etc.)
+# Instala q8_kernels
+RUN pip install --no-cache-dir \
+   "https://huggingface.co/euIaxs22/Aduc-sdr/resolve/main/q8_kernels-0.0.5-cp310-cp310-linux_x86_64.whl"
 RUN echo "Installing custom wheels..." && \
     pip install --no-cache-dir \
       "https://huggingface.co/euIaxs22/Aduc-sdr/resolve/main/apex-0.1-cp310-cp310-linux_x86_64.whl" \
       "https://huggingface.co/euIaxs22/Aduc-sdr/resolve/main/dropout_layer_norm-0.1-cp310-cp310-linux_x86_64.whl"
+# =============================================================================
+# 5. Cópia do Código-Fonte e Configuração Final
+# =============================================================================
+# Copia o restante do código-fonte da aplicação por último.
+COPY --chown=appuser:appuser . .
+# Garante que todos os scripts de inicialização sejam executáveis
+# e que o usuário 'appuser' seja o dono de todos os arquivos.
+RUN chown -R appuser:appuser $APP_HOME /data && \
+    chmod +x /app/entrypoint.sh /app/start.sh /app/info.sh /app/builder.sh
+# =============================================================================
+# 6. Ponto de Entrada
+# =============================================================================
+# Expõe o diretório /data para ser montado como um volume persistente.
 VOLUME /data
+# Define o usuário padrão para a execução do contêiner.
 USER appuser
+# Define o script que será executado na inicialização do contêiner.
+ENTRYPOINT ["/app/entrypoint.sh"]
+# Define o comando padrão a ser executado pelo entrypoint.
+CMD ["/app/start.sh"]

app.py CHANGED Viewed

@@ -1,307 +1,194 @@
-# FILE: app_complete.py
-# DESCRIPTION: Gradio web interface for the LTX-Video generation service.
-#              Provides a user-friendly, step-by-step workflow for creating videos.
 import gradio as gr
-import traceback
 import sys
-# ==============================================================================
-# --- BACKEND SERVICES IMPORT ---
-# ==============================================================================
-# Encapsulate imports in a try-except block for robust error handling at startup.
-try:
-    # This assumes the backend file is named 'ltx_server_refactored_complete.py'
-    # and is in a reachable path (e.g., 'api/').
-    from api.ltx_server_refactored_complete import video_generation_service
-    # Placeholder for SeedVR server.
-    # from api.seedvr_server import SeedVRServer
-    # seedvr_inference_server = SeedVRServer()
-    seedvr_inference_server = None
-    print("Backend services imported successfully.")
-except ImportError as e:
-    print(f"FATAL ERROR: Could not import backend services. Ensure the backend file is accessible. Details: {e}")
-    sys.exit(1)
-except Exception as e:
-    print(f"FATAL ERROR: An unexpected error occurred during backend initialization. Details: {e}")
-    sys.exit(1)
-# ==============================================================================
-# --- UI WRAPPER FUNCTIONS ---
-# These functions act as a bridge between the Gradio UI and the backend service.
-# They handle data conversion, error catching, and UI updates.
-# ==============================================================================
-def run_generate_base_video(
-    generation_mode: str, prompt: str, neg_prompt: str, start_img: str,
-    height: int, width: int, duration: float, seed: int, randomize_seed: bool,
-    fp_guidance_preset: str, fp_guidance_scale_list: str, fp_stg_scale_list: str,
-    progress=gr.Progress(track_tqdm=True)
-) -> tuple:
-    """
-    Wrapper to call the backend for generating the initial low-resolution video.
-    It decides whether to use the 'narrative' or 'single' generation mode.
-    """
-    try:
-        print(f"[UI] Request received for base video generation. Mode: {generation_mode}")
-        initial_conditions = []
         if start_img:
-            # Estimate total frames for conditioning context
             num_frames_estimate = int(duration * 24)
-            items_list = [[start_img, 0, 1.0]] # [[media, frame, weight]]
-            initial_conditions = video_generation_service.prepare_condition_items(
-                items_list, height, width, num_frames_estimate
-            )
-        # Package advanced LTX settings for the backend
-        ltx_configs = {
-            "guidance_preset": fp_guidance_preset,
-            "guidance_scale_list": fp_guidance_scale_list,
-            "stg_scale_list": fp_stg_scale_list,
-        }
-        # Select the appropriate backend function based on UI mode
-        if generation_mode == "Narrativa (Múltiplos Prompts)":
-            func_to_call = video_generation_service.generate_narrative_low
-        else:
-            func_to_call = video_generation_service.generate_single_low
-        video_path, tensor_path, final_seed = func_to_call(
             prompt=prompt, negative_prompt=neg_prompt,
-            height=height, width=width, duration=duration,
-            seed=None if randomize_seed else int(seed),
-            initial_conditions=initial_conditions,
-            ltx_configs_override=ltx_configs
         )
-        if not video_path:
-            raise RuntimeError("Backend failed to return a valid video path.")
-        # Update the session state with the results
-        new_state = {"low_res_video": video_path, "low_res_latents": tensor_path, "used_seed": final_seed}
-        print(f"[UI] Base video generation successful. Path: {video_path}")
         return video_path, new_state, gr.update(visible=True)
-    except Exception as e:
-        error_message = f"❌ An error occurred during base generation:\n{e}"
-        print(f"{error_message}\nDetails: {traceback.format_exc()}")
-        raise gr.Error(error_message)
-def run_ltx_refinement(
-    state: dict, prompt: str, neg_prompt: str,
-    progress=gr.Progress(track_tqdm=True)
-) -> tuple:
-    """Wrapper to call the LTX texture refinement and upscaling backend function."""
-    if not state or not state.get("low_res_latents"):
-        raise gr.Error("Error: Please generate a base video in Step 1 before refining.")
-    try:
-        print("[UI] Request received for LTX refinement.")
         video_path, tensor_path = video_generation_service.generate_upscale_denoise(
             latents_path=state["low_res_latents"],
             prompt=prompt,
             negative_prompt=neg_prompt,
             seed=state["used_seed"]
         )
-        # Update state with refined assets
         state["refined_video_ltx"] = video_path
         state["refined_latents_ltx"] = tensor_path
-        print(f"[UI] LTX refinement successful. Path: {video_path}")
         return video_path, state
-    except Exception as e:
-        error_message = f"❌ An error occurred during LTX Refinement:\n{e}"
-        print(f"{error_message}\nDetails: {traceback.format_exc()}")
-        raise gr.Error(error_message)
-def run_seedvr_upscaling(
-    state: dict, seed: int, resolution: int, batch_size: int, fps: int,
-    progress=gr.Progress(track_tqdm=True)
-) -> tuple:
-    """Wrapper to call the SeedVR upscaling backend service."""
-    if not state or not state.get("low_res_video"):
-        raise gr.Error("Error: Please generate a base video in Step 1 before upscaling.")
-    if not seedvr_inference_server:
-        raise gr.Error("Error: The SeedVR upscaling server is not available.")
-    try:
-        print("[UI] Request received for SeedVR upscaling.")
-        def progress_wrapper(p, desc=""): progress(p, desc=desc)
         output_filepath = seedvr_inference_server.run_inference(
-            file_path=state["low_res_video"], seed=seed, resolution=resolution,
             batch_size=batch_size, fps=fps, progress=progress_wrapper
         )
-        status_message = f"✅ Upscaling complete!\nSaved to: {output_filepath}"
-        print(f"[UI] SeedVR upscaling successful. Path: {output_filepath}")
-        return gr.update(value=output_filepath), gr.update(value=status_message)
-    except Exception as e:
-        error_message = f"❌ An error occurred during SeedVR Upscaling:\n{e}"
-        print(f"{error_message}\nDetails: {traceback.format_exc()}")
-        return None, gr.update(value=error_message)
-# ==============================================================================
-# --- UI BUILDER ---
-# Functions dedicated to creating parts of the Gradio interface.
-# ==============================================================================
-def build_ui():
-    """Constructs the entire Gradio application UI."""
-    with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo:
-        # App state persists across interactions within a session
-        app_state = gr.State(value={"low_res_video": None, "low_res_latents": None, "used_seed": None})
-        gr.Markdown("# LTX Video - Geração e Pós-Produção por Etapas", elem_id="main-title")
-        ui_components = {} # Dictionary to hold all key UI components
-        with gr.Row():
-            with gr.Column(scale=1):
-                # Build the main generation controls (Step 1)
-                _build_generation_controls(ui_components)
-            with gr.Column(scale=1):
-                gr.Markdown("### Vídeo Base Gerado")
-                ui_components['low_res_video_output'] = gr.Video(
-                    label="O resultado da Etapa 1 aparecerá aqui", interactive=False
-                )
-        # Build the post-production section (Step 2), initially hidden
-        _build_postprod_controls(ui_components)
-        # Connect all UI events to their corresponding functions
-        _register_event_handlers(app_state, ui_components)
-    return demo
-def _build_generation_controls(ui: dict):
-    """Builds the UI components for Step 1: Base Video Generation."""
-    gr.Markdown("### Etapa 1: Configurações de Geração")
-    ui['generation_mode'] = gr.Radio(
-        label="Modo de Geração",
-        choices=["Simples (Prompt Único)", "Narrativa (Múltiplos Prompts)"],
-        value="Narrativa (Múltiplos Prompts)",
-        info="Simples para uma ação contínua, Narrativa para uma sequência de cenas (uma por linha)."
-    )
-    ui['prompt'] = gr.Textbox(label="Prompt(s)", value="Um leão majestoso caminha pela savana\nEle sobe em uma grande pedra e olha o horizonte", lines=4)
-    ui['neg_prompt'] = gr.Textbox(label="Negative Prompt", value="blurry, low quality, bad anatomy, deformed", lines=2)
-    ui['start_image'] = gr.Image(label="Imagem de Início (Opcional)", type="filepath", sources=["upload"])
-    with gr.Accordion("Parâmetros Principais", open=True):
-        ui['duration'] = gr.Slider(label="Duração Total (s)", value=4, step=1, minimum=1, maximum=30)
-        with gr.Row():
-            ui['height'] = gr.Slider(label="Height", value=432, step=16, minimum=256, maximum=1024)
-            ui['width'] = gr.Slider(label="Width", value=768, step=16, minimum=256, maximum=1024)
-        with gr.Row():
-            ui['seed'] = gr.Number(label="Seed", value=42, precision=0)
-            ui['randomize_seed'] = gr.Checkbox(label="Randomize Seed", value=True)
-    with gr.Accordion("Opções Avançadas de Guiagem (First Pass)", open=False):
-        ui['fp_guidance_preset'] = gr.Dropdown(
-            label="Preset de Guiagem",
-            choices=["Padrão (Recomendado)", "Agressivo", "Suave", "Customizado"],
-            value="Padrão (Recomendado)",
-            info="Controla como a guiagem de texto se comporta ao longo da difusão."
-        )
-        with gr.Group(visible=False) as ui['custom_guidance_group']:
-            gr.Markdown("⚠️ Edite as listas em formato JSON. Ex: `[1.0, 2.5, 3.0]`")
-            ui['fp_guidance_scale_list'] = gr.Textbox(label="Lista de Guidance Scale", value="[1, 1, 6, 8, 6, 1, 1]")
-            ui['fp_stg_scale_list'] = gr.Textbox(label="Lista de STG Scale (Movimento)", value="[0, 0, 4, 4, 4, 2, 1]")
-    ui['generate_low_btn'] = gr.Button("1. Gerar Vídeo Base", variant="primary")
-def _build_postprod_controls(ui: dict):
-    """Builds the UI components for Step 2: Post-Production."""
-    with gr.Group(visible=False) as ui['post_prod_group']:
-        gr.Markdown("--- \n## Etapa 2: Pós-Produção", elem_id="postprod-title")
         with gr.Tabs():
-            with gr.TabItem("🚀 Upscaler de Textura (LTX)"):
                 with gr.Row():
                     with gr.Column(scale=1):
-                         gr.Markdown("Usa o prompt e a semente originais para refinar o vídeo, adicionando detalhes e texturas de alta qualidade.")
-                         ui['ltx_refine_btn'] = gr.Button("2. Aplicar Refinamento LTX", variant="primary")
                     with gr.Column(scale=1):
-                        ui['ltx_refined_video_output'] = gr.Video(label="Vídeo com Textura Refinada", interactive=False)
-            with gr.TabItem("✨ Upscaler de Resolução (SeedVR)"):
-                is_seedvr_available = seedvr_inference_server is not None
-                if not is_seedvr_available:
-                    gr.Markdown("🔴 *O serviço SeedVR não está disponível nesta instância.*")
                 with gr.Row():
                     with gr.Column(scale=1):
-                        ui['seedvr_seed'] = gr.Slider(minimum=0, maximum=999999, value=42, step=1, label="Seed")
-                        ui['seedvr_resolution'] = gr.Slider(minimum=720, maximum=1440, value=1072, step=8, label="Resolução Vertical")
-                        ui['seedvr_batch_size'] = gr.Slider(minimum=1, maximum=16, value=4, step=1, label="Batch Size por GPU")
-                        ui['seedvr_fps'] = gr.Number(label="FPS de Saída (0 = original)", value=0)
-                        ui['run_seedvr_btn'] = gr.Button("2. Iniciar Upscaling SeedVR", variant="primary", interactive=is_seedvr_available)
                     with gr.Column(scale=1):
-                        ui['seedvr_video_output'] = gr.Video(label="Vídeo com Upscale SeedVR", interactive=False)
-                        ui['seedvr_status_box'] = gr.Textbox(label="Status", value="Aguardando...", lines=3, interactive=False)
-# ==============================================================================
-# --- EVENT HANDLERS ---
-# Connects UI component events (like clicks) to the wrapper functions.
-# ==============================================================================
-def _register_event_handlers(app_state: gr.State, ui: dict):
-    """Registers all Gradio event handlers."""
-    # --- Handler for custom guidance visibility ---
-    def toggle_custom_guidance(preset_choice: str) -> gr.update:
-        return gr.update(visible=(preset_choice == "Customizado"))
-    ui['fp_guidance_preset'].change(
-        fn=toggle_custom_guidance,
-        inputs=ui['fp_guidance_preset'],
-        outputs=ui['custom_guidance_group']
     )
-    # --- Handler for the main "Generate" button ---
-    gen_inputs = [
-        ui['generation_mode'], ui['prompt'], ui['neg_prompt'], ui['start_image'],
-        ui['height'], ui['width'], ui['duration'], ui['seed'], ui['randomize_seed'],
-        ui['fp_guidance_preset'], ui['fp_guidance_scale_list'], ui['fp_stg_scale_list']
-    ]
-    gen_outputs = [
-        ui['low_res_video_output'], app_state, ui['post_prod_group']
-    ]
-    ui['generate_low_btn'].click(fn=run_generate_base_video, inputs=gen_inputs, outputs=gen_outputs)
-    # --- Handler for the LTX Refine button ---
-    refine_inputs = [app_state, ui['prompt'], ui['neg_prompt']]
-    refine_outputs = [ui['ltx_refined_video_output'], app_state]
-    ui['ltx_refine_btn'].click(fn=run_ltx_refinement, inputs=refine_inputs, outputs=refine_outputs)
-    # --- Handler for the SeedVR Upscale button ---
-    if 'run_seedvr_btn' in ui:
-        seedvr_inputs = [app_state, ui['seedvr_seed'], ui['seedvr_resolution'], ui['seedvr_batch_size'], ui['seedvr_fps']]
-        seedvr_outputs = [ui['seedvr_video_output'], ui['seedvr_status_box']]
-        ui['run_seedvr_btn'].click(fn=run_seedvr_upscaling, inputs=seedvr_inputs, outputs=seedvr_outputs)
-# ==============================================================================
-# --- APPLICATION ENTRY POINT ---
-# ==============================================================================
 if __name__ == "__main__":
-    print("Building Gradio UI...")
-    gradio_app = build_ui()
-    print("Launching Gradio app...")
-    gradio_app.queue().launch(
-        server_name="0.0.0.0",
-        server_port=7860,
-        debug=True,
-        show_error=True
-    )

+# app_refactored_with_postprod.py (FINAL VERSION with LTX Refinement)
 import gradio as gr
+import os
 import sys
+import traceback
+from pathlib import Path
+import torch
+import numpy as np
+from PIL import Image
+# --- Import dos Serviços de Backend ---
+# Serviço LTX para geração de vídeo base e refinamento de textura
+from api.ltx_server_refactored import video_generation_service
+# Serviço SeedVR para upscaling de alta qualidade
+from api.seedvr_server import SeedVRServer
+# Inicializa o servidor SeedVR uma vez, se disponível
+seedvr_inference_server = SeedVRServer() if SeedVRServer else None
+# --- ESTADO DA SESSÃO ---
+def create_initial_state():
+    return {
+        "low_res_video": None,
+        "low_res_latents": None,
+        "refined_video_ltx": None,
+        "refined_latents_ltx": None,
+        "used_seed": None
+    }
+# --- FUNÇÕES WRAPPER PARA A UI ---
+def run_generate_low(prompt, neg_prompt, start_img, height, width, duration, cfg, seed, randomize_seed, progress=gr.Progress(track_tqdm=True)):
+    """Executa a primeira etapa: geração de um vídeo base em baixa resolução."""
+    print("UI: Chamando generate_low")
+    if True:
+        conditioning_items = []
         if start_img:
             num_frames_estimate = int(duration * 24)
+            items_list = [[start_img, 0, 1.0]]
+            conditioning_items = video_generation_service._prepare_condition_items(items_list, height, width, num_frames_estimate)
+        used_seed = None if randomize_seed else seed
+        video_path, tensor_path, final_seed = video_generation_service.generate_low_resolution(
             prompt=prompt, negative_prompt=neg_prompt,
+            height=height, width=width, duration_secs=duration,
+            guidance_scale=cfg, seed=used_seed,
+            conditioning_items=conditioning_items
         )
+        new_state = {
+            "low_res_video": video_path,
+            "low_res_latents": tensor_path,
+            "refined_video_ltx": None,
+            "refined_latents_ltx": None,
+            "used_seed": final_seed
+        }
         return video_path, new_state, gr.update(visible=True)
+def run_ltx_refinement(state, prompt, neg_prompt, cfg, progress=gr.Progress(track_tqdm=True)):
+    """Executa o processo de refinamento e upscaling de textura com o pipeline LTX."""
+    print("UI: Chamando run_ltx_refinement (generate_upscale_denoise)")
+    if True:
         video_path, tensor_path = video_generation_service.generate_upscale_denoise(
             latents_path=state["low_res_latents"],
             prompt=prompt,
             negative_prompt=neg_prompt,
+            guidance_scale=cfg,
             seed=state["used_seed"]
         )
+        # Atualiza o estado com os novos artefatos refinados
         state["refined_video_ltx"] = video_path
         state["refined_latents_ltx"] = tensor_path
         return video_path, state
+def run_seedvr_upscaling(state, seed, resolution, batch_size, fps, progress=gr.Progress(track_tqdm=True)):
+    """Executa o processo de upscaling com SeedVR."""
+    video_path = state["low_res_video"]
+    print(f"▶️  Iniciando processo de upscaling SeedVR para o vídeo: {video_path}")
+    if True:
+        def progress_wrapper(p, desc=""):
+            progress(p, desc=desc)
         output_filepath = seedvr_inference_server.run_inference(
+            file_path=video_path, seed=seed, resolution=resolution,
             batch_size=batch_size, fps=fps, progress=progress_wrapper
         )
+        final_message = f"✅ Processo SeedVR concluído!\nVídeo salvo em: {output_filepath}"
+        return gr.update(value=output_filepath, interactive=True), gr.update(value=final_message, interactive=False)
+# --- DEFINIÇÃO DA INTERFACE GRADIO ---
+with gr.Blocks() as demo:
+    gr.Markdown("# LTX Video - Geração e Pós-Produção por Etapas")
+    app_state = gr.State(value=create_initial_state())
+    # --- ETAPA 1: Geração Base ---
+    with gr.Row():
+        with gr.Column(scale=1):
+            gr.Markdown("### Etapa 1: Configurações de Geração")
+            prompt_input = gr.Textbox(label="Prompt", value="A majestic dragon flying over a medieval castle", lines=3)
+            neg_prompt_input = gr.Textbox(visible=False, label="Negative Prompt", value="worst quality, blurry, low quality, jittery", lines=2)
+            start_image = gr.Image(label="Imagem de Início (Opcional)", type="filepath", sources=["upload", "clipboard"])
+            with gr.Accordion("Parâmetros Avançados", open=False):
+                height_input = gr.Slider(label="Height", value=512, step=64, minimum=256, maximum=1024)
+                width_input = gr.Slider(label="Width", value=512, step=64, minimum=256, maximum=1024)
+                duration_input = gr.Slider(label="Duração (s)", value=8, step=0.5, minimum=1, maximum=16)
+                cfg_input = gr.Slider(label="Guidance Scale (CFG)", value=5.0, step=1, minimum=1, maximum=10.0)
+                seed_input = gr.Number(label="Seed", value=42, precision=0)
+                randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
+            generate_low_btn = gr.Button("1. Gerar Vídeo Base (Low-Res)", variant="primary")
+        with gr.Column(scale=1):
+            gr.Markdown("### Vídeo Base Gerado")
+            low_res_video_output = gr.Video(label="O resultado da Etapa 1 aparecerá aqui", interactive=False)
+    # --- ETAPA 2: Pós-Produção (no rodapé, em abas) ---
+    with gr.Group(visible=False) as post_prod_group:
+        gr.Markdown("<hr style='margin-top: 20px; margin-bottom: 20px;'>")
+        gr.Markdown("## Etapa 2: Pós-Produção")
+        gr.Markdown("Use o vídeo gerado acima como entrada para as ferramentas abaixo. **O prompt e a CFG da Etapa 1 serão reutilizados.**")
         with gr.Tabs():
+            # --- ABA LTX REFINEMENT (AGORA FUNCIONAL) ---
+            with gr.TabItem("🚀 Upscaler Textura (LTX)"):
                 with gr.Row():
                     with gr.Column(scale=1):
+                         gr.Markdown("### Parâmetros de Refinamento")
+                         gr.Markdown("Esta etapa reutiliza o prompt, o prompt negativo e a CFG da Etapa 1 para manter a consistência.")
+                         ltx_refine_btn = gr.Button("Aplicar Refinamento de Textura LTX", variant="primary")
                     with gr.Column(scale=1):
+                        gr.Markdown("### Resultado do Refinamento")
+                        ltx_refined_video_output = gr.Video(label="Vídeo com Textura Refinada (LTX)", interactive=False)
+            # --- ABA SEEDVR UPSCALER ---
+            with gr.TabItem("✨ Upscaler SeedVR"):
                 with gr.Row():
                     with gr.Column(scale=1):
+                        gr.Markdown("### Parâmetros do SeedVR")
+                        seedvr_seed = gr.Slider(minimum=0, maximum=999999, value=42, step=1, label="Seed")
+                        seedvr_resolution = gr.Slider(minimum=720, maximum=1440, value=1072, step=8, label="Resolução Vertical (Altura)")
+                        seedvr_batch_size = gr.Slider(minimum=1, maximum=16, value=4, step=1, label="Batch Size por GPU")
+                        seedvr_fps_output = gr.Number(label="FPS de Saída (0 = original)", value=0)
+                        run_seedvr_button = gr.Button("Iniciar Upscaling SeedVR", variant="primary", interactive=(seedvr_inference_server is not None))
+                        if not seedvr_inference_server:
+                             gr.Markdown("<p style='color: red;'>Serviço SeedVR não disponível.</p>")
                     with gr.Column(scale=1):
+                        gr.Markdown("### Resultado do Upscaling")
+                        seedvr_video_output = gr.Video(label="Vídeo com Upscale SeedVR", interactive=False)
+                        seedvr_status_box = gr.Textbox(label="Status do Processamento", value="Aguardando...", lines=3, interactive=False)
+            # --- ABA MM-AUDIO ---
+            with gr.TabItem("🔊 Áudio (MM-Audio)"):
+                gr.Markdown("*(Funcionalidade futura para adicionar som aos vídeos)*")
+    # --- LÓGICA DE EVENTOS DA UI ---
+    # Botão da Etapa 1
+    generate_low_btn.click(
+        fn=run_generate_low,
+        inputs=[prompt_input, neg_prompt_input, start_image, height_input, width_input, duration_input, cfg_input, seed_input, randomize_seed],
+        outputs=[low_res_video_output, app_state, post_prod_group]
     )
+    # Botão da Aba LTX Refinement
+    ltx_refine_btn.click(
+        fn=run_ltx_refinement,
+        inputs=[app_state, prompt_input, neg_prompt_input, cfg_input],
+        outputs=[ltx_refined_video_output, app_state]
+    )
+    # Botão da Aba SeedVR
+    run_seedvr_button.click(
+        fn=run_seedvr_upscaling,
+        inputs=[app_state, seedvr_seed, seedvr_resolution, seedvr_batch_size, seedvr_fps_output],
+        outputs=[seedvr_video_output, seedvr_status_box]
+    )
 if __name__ == "__main__":
+    demo.queue().launch(server_name="0.0.0.0", server_port=7860, debug=True, show_error=True)

compose.yaml CHANGED Viewed

@@ -1,26 +1,39 @@
 services:
-  vincie:
-    image: img2img:edit
     deploy:
       resources:
         reservations:
           devices:
-            - capabilities: [gpu]
     ports:
-      - "7860:7860"
-    environment:
-      GRADIO_SERVER_PORT: "7860"
-      HF_HUB_CACHE: "/data/.cache/huggingface/hub"
-      CKPT_ROOT: "/data/ckpt/VINCIE-3B"
-      VINCIE_ROOT: "/data/VINCIE"
     volumes:
-      - vincie_hub:/data/.cache/huggingface/hub
-      - vincie_ckpt:/data/ckpt/VINCIE-3B
-      - vincie_out:/app/outputs
-      - vincie_repo:/data/VINCIE
 volumes:
-  vincie_hub: {}
-  vincie_ckpt: {}
-  vincie_out: {}
-  vincie_repo: {}

+# compose.yaml (Versão com VINCIE)
+version: '3.8'
 services:
+  aduc-sdr-app:
+    build: .
+    environment:
+      ADUC_LOG_LEVEL: "DEBUG"
+    image: aduc-sdr-videosuite:latest
+    # (deploy, resources... mantidos como antes)
+    ports:
+      - "7860:7860" # Porta para a UI principal (LTX + SeedVR)
+      - "7861:7861" # Porta para a nova UI do VINCIE
+    volumes:
+      # O volume 'aduc_data' agora armazena tudo: cache, modelos e repos.
+      - aduc_data:/data
+      - ./output:/app/output
+    # O entrypoint cuidará do setup na inicialização.
+    # O CMD padrão iniciará a UI principal. Para VINCIE, usaremos um comando diferente.
+  # Novo serviço para a interface do VINCIE
+  vince-ui:
+    image: aduc-sdr-videosuite:latest # Usa a mesma imagem já construída
+    command: python3 /app/app_vince.py # Sobrescreve o CMD padrão para iniciar a UI do VINCIE
     deploy:
       resources:
         reservations:
           devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
     ports:
+      - "7861:7861"
     volumes:
+      - aduc_data:/data
+      - ./output:/app/output
 volumes:
+  aduc_data:

entrypoint.sh CHANGED Viewed

@@ -1,21 +1,44 @@
-#!/bin/sh
-# entrypoint.sh - Executado como root para corrigir permissões.
 set -e
-echo "🔐 ENTRYPOINT (root): Corrigindo permissões para os diretórios de dados e saída..."
-# Lista de diretórios a serem criados e terem suas permissões ajustadas
-# Usamos os valores padrão, pois as variáveis de ambiente podem não estar disponíveis aqui.
-DIRS_TO_OWN="/app/outputs /app/inputs"
-# Garante que os diretórios existam
-mkdir -p $DIRS_TO_OWN
-# Muda o proprietário para o UID e GID 1000, que corresponde ao 'appuser'
-# Usar UID/GID é mais robusto em ambientes de contêiner.
-chown -R 1000:1000 $DIRS_TO_OWN
-echo "✅ ENTRYPOINT (root): Permissões corrigidas."
-# Passa a execução para o comando principal (CMD) definido no Dockerfile.
 exec "$@"

+#!/bin/bash
 set -e
+echo "🚀 ADUC-SDR Entrypoint: Configurando o ambiente de execução..."
+# --- Configuração de Performance (CPU & GPU) ---
+NUM_VCPUS=$(nproc)
+NUM_GPUS=$(nvidia-smi --query-gpu=count --format=csv,noheader | head -n 1 || echo 0)
+echo "   > Hardware: ${NUM_VCPUS} vCPUs, ${NUM_GPUS} GPUs"
+if [[ ${NUM_GPUS} -gt 0 ]]; then
+    VCPUS_PER_GPU=$((NUM_VCPUS / NUM_GPUS))
+    THREADS_PER_PROCESS=$((VCPUS_PER_GPU / 2))
+else
+    THREADS_PER_PROCESS=$((NUM_VCPUS / 2))
+fi
+MIN_THREADS=4; MAX_THREADS=16
+if [[ ${THREADS_PER_PROCESS} -lt ${MIN_THREADS} ]]; then THREADS_PER_PROCESS=${MIN_THREADS}; fi
+if [[ ${THREADS_PER_PROCESS} -gt ${MAX_THREADS} ]]; then THREADS_PER_PROCESS=${MAX_THREADS}; fi
+export OMP_NUM_THREADS=${OMP_NUM_THREADS:-${THREADS_PER_PROCESS}}
+export MKL_NUM_THREADS=${MKL_NUM_THREADS:-${THREADS_PER_PROCESS}}
+export MAX_JOBS=${MAX_JOBS:-${NUM_VCPUS}}
+export PYTORCH_CUDA_ALLOC_CONF=${PYTORCH_CUDA_ALLOC_CONF:-"max_split_size_mb:512"}
+export NVIDIA_TF32_OVERRIDE=${NVIDIA_TF32_OVERRIDE:-1}
+# --- Configuração de Depuração e Logging ---
+export ADUC_LOG_LEVEL=${ADUC_LOG_LEVEL:-"INFO"}
+export CUDA_LAUNCH_BLOCKING=${CUDA_LAUNCH_BLOCKING:-0}
+export PYTHONFAULTHANDLER=1
+export GRADIO_DEBUG=${GRADIO_DEBUG:-"False"}
+echo "   > Performance: OMP_NUM_THREADS=${OMP_NUM_THREADS}, MKL_NUM_THREADS=${MKL_NUM_THREADS}"
+echo "   > Depuração: ADUC_LOG_LEVEL=${ADUC_LOG_LEVEL}, CUDA_LAUNCH_BLOCKING=${CUDA_LAUNCH_BLOCKING}"
+echo ""
+echo ""
+#/bin/bash /app/info.sh
+# --- Setup de Dependências ---
+echo "   > Verificando dependências com setup.py..."
+python3 /app/setup.py
+echo "---------------------------------------------------------"
+echo "🔥 Ambiente configurado. Iniciando o comando principal: $@"
 exec "$@"

pipeline_ltx_video (3).py ADDED Viewed

	@@ -0,0 +1,1895 @@

+# Adapted from: https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py
+import copy
+import inspect
+import math
+import re
+from contextlib import nullcontext
+from dataclasses import dataclass
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+import torch
+import torch.nn.functional as F
+from diffusers.image_processor import VaeImageProcessor
+from diffusers.models import AutoencoderKL
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+from diffusers.schedulers import DPMSolverMultistepScheduler
+from diffusers.utils import deprecate, logging
+from diffusers.utils.torch_utils import randn_tensor
+from einops import rearrange
+from transformers import (
+    T5EncoderModel,
+    T5Tokenizer,
+    AutoModelForCausalLM,
+    AutoProcessor,
+    AutoTokenizer,
+)
+from ltx_video.models.autoencoders.causal_video_autoencoder import (
+    CausalVideoAutoencoder,
+)
+from ltx_video.models.autoencoders.vae_encode import (
+    get_vae_size_scale_factor,
+    latent_to_pixel_coords,
+    vae_decode,
+    vae_encode,
+)
+from ltx_video.models.transformers.symmetric_patchifier import Patchifier
+from ltx_video.models.transformers.transformer3d import Transformer3DModel
+from ltx_video.schedulers.rf import TimestepShifter
+from ltx_video.utils.skip_layer_strategy import SkipLayerStrategy
+from ltx_video.utils.prompt_enhance_utils import generate_cinematic_prompt
+from ltx_video.models.autoencoders.latent_upsampler import LatentUpsampler
+from ltx_video.models.autoencoders.vae_encode import (
+    un_normalize_latents,
+    normalize_latents,
+)
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+ASPECT_RATIO_1024_BIN = {
+    "0.25": [512.0, 2048.0],
+    "0.28": [512.0, 1856.0],
+    "0.32": [576.0, 1792.0],
+    "0.33": [576.0, 1728.0],
+    "0.35": [576.0, 1664.0],
+    "0.4": [640.0, 1600.0],
+    "0.42": [640.0, 1536.0],
+    "0.48": [704.0, 1472.0],
+    "0.5": [704.0, 1408.0],
+    "0.52": [704.0, 1344.0],
+    "0.57": [768.0, 1344.0],
+    "0.6": [768.0, 1280.0],
+    "0.68": [832.0, 1216.0],
+    "0.72": [832.0, 1152.0],
+    "0.78": [896.0, 1152.0],
+    "0.82": [896.0, 1088.0],
+    "0.88": [960.0, 1088.0],
+    "0.94": [960.0, 1024.0],
+    "1.0": [1024.0, 1024.0],
+    "1.07": [1024.0, 960.0],
+    "1.13": [1088.0, 960.0],
+    "1.21": [1088.0, 896.0],
+    "1.29": [1152.0, 896.0],
+    "1.38": [1152.0, 832.0],
+    "1.46": [1216.0, 832.0],
+    "1.67": [1280.0, 768.0],
+    "1.75": [1344.0, 768.0],
+    "2.0": [1408.0, 704.0],
+    "2.09": [1472.0, 704.0],
+    "2.4": [1536.0, 640.0],
+    "2.5": [1600.0, 640.0],
+    "3.0": [1728.0, 576.0],
+    "4.0": [2048.0, 512.0],
+}
+ASPECT_RATIO_512_BIN = {
+    "0.25": [256.0, 1024.0],
+    "0.28": [256.0, 928.0],
+    "0.32": [288.0, 896.0],
+    "0.33": [288.0, 864.0],
+    "0.35": [288.0, 832.0],
+    "0.4": [320.0, 800.0],
+    "0.42": [320.0, 768.0],
+    "0.48": [352.0, 736.0],
+    "0.5": [352.0, 704.0],
+    "0.52": [352.0, 672.0],
+    "0.57": [384.0, 672.0],
+    "0.6": [384.0, 640.0],
+    "0.68": [416.0, 608.0],
+    "0.72": [416.0, 576.0],
+    "0.78": [448.0, 576.0],
+    "0.82": [448.0, 544.0],
+    "0.88": [480.0, 544.0],
+    "0.94": [480.0, 512.0],
+    "1.0": [512.0, 512.0],
+    "1.07": [512.0, 480.0],
+    "1.13": [544.0, 480.0],
+    "1.21": [544.0, 448.0],
+    "1.29": [576.0, 448.0],
+    "1.38": [576.0, 416.0],
+    "1.46": [608.0, 416.0],
+    "1.67": [640.0, 384.0],
+    "1.75": [672.0, 384.0],
+    "2.0": [704.0, 352.0],
+    "2.09": [736.0, 352.0],
+    "2.4": [768.0, 320.0],
+    "2.5": [800.0, 320.0],
+    "3.0": [864.0, 288.0],
+    "4.0": [1024.0, 256.0],
+}
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    skip_initial_inference_steps: int = 0,
+    skip_final_inference_steps: int = 0,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used,
+            `timesteps` must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
+            timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps`
+            must be `None`.
+        max_timestep ('float', *optional*, defaults to 1.0):
+            The initial noising level for image-to-image/video-to-video. The list if timestamps will be
+            truncated to start with a timestamp greater or equal to this.
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(
+            inspect.signature(scheduler.set_timesteps).parameters.keys()
+        )
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        if (
+            skip_initial_inference_steps < 0
+            or skip_final_inference_steps < 0
+            or skip_initial_inference_steps + skip_final_inference_steps
+            >= num_inference_steps
+        ):
+            raise ValueError(
+                "invalid skip inference step values: must be non-negative and the sum of skip_initial_inference_steps and skip_final_inference_steps must be less than the number of inference steps"
+            )
+        timesteps = timesteps[
+            skip_initial_inference_steps : len(timesteps) - skip_final_inference_steps
+        ]
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        num_inference_steps = len(timesteps)
+    return timesteps, num_inference_steps
+@dataclass
+class ConditioningItem:
+    """
+    Defines a single frame-conditioning item - a single frame or a sequence of frames.
+    Attributes:
+        media_item (torch.Tensor): shape=(b, 3, f, h, w). The media item to condition on.
+        media_frame_number (int): The start-frame number of the media item in the generated video.
+        conditioning_strength (float): The strength of the conditioning (1.0 = full conditioning).
+        media_x (Optional[int]): Optional left x coordinate of the media item in the generated frame.
+        media_y (Optional[int]): Optional top y coordinate of the media item in the generated frame.
+    """
+    media_item: torch.Tensor
+    media_frame_number: int
+    conditioning_strength: float
+    media_x: Optional[int] = None
+    media_y: Optional[int] = None
+class LTXVideoPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for text-to-image generation using LTX-Video.
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`T5EncoderModel`]):
+            Frozen text-encoder. This uses
+            [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel), specifically the
+            [t5-v1_1-xxl](https://huggingface.co/PixArt-alpha/PixArt-alpha/tree/main/t5-v1_1-xxl) variant.
+        tokenizer (`T5Tokenizer`):
+            Tokenizer of class
+            [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
+        transformer ([`Transformer2DModel`]):
+            A text conditioned `Transformer2DModel` to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
+    """
+    bad_punct_regex = re.compile(
+        r"["
+        + "#®•©™&@·º½¾¿¡§~"
+        + r"\)"
+        + r"\("
+        + r"\]"
+        + r"\["
+        + r"\}"
+        + r"\{"
+        + r"\|"
+        + "\\"
+        + r"\/"
+        + r"\*"
+        + r"]{1,}"
+    )  # noqa
+    _optional_components = [
+        "tokenizer",
+        "text_encoder",
+        "prompt_enhancer_image_caption_model",
+        "prompt_enhancer_image_caption_processor",
+        "prompt_enhancer_llm_model",
+        "prompt_enhancer_llm_tokenizer",
+    ]
+    model_cpu_offload_seq = "prompt_enhancer_image_caption_model->prompt_enhancer_llm_model->text_encoder->transformer->vae"
+    def __init__(
+        self,
+        tokenizer: T5Tokenizer,
+        text_encoder: T5EncoderModel,
+        vae: AutoencoderKL,
+        transformer: Transformer3DModel,
+        scheduler: DPMSolverMultistepScheduler,
+        patchifier: Patchifier,
+        prompt_enhancer_image_caption_model: AutoModelForCausalLM,
+        prompt_enhancer_image_caption_processor: AutoProcessor,
+        prompt_enhancer_llm_model: AutoModelForCausalLM,
+        prompt_enhancer_llm_tokenizer: AutoTokenizer,
+        allowed_inference_steps: Optional[List[float]] = None,
+    ):
+        super().__init__()
+        self.register_modules(
+            tokenizer=tokenizer,
+            text_encoder=text_encoder,
+            vae=vae,
+            transformer=transformer,
+            scheduler=scheduler,
+            patchifier=patchifier,
+            prompt_enhancer_image_caption_model=prompt_enhancer_image_caption_model,
+            prompt_enhancer_image_caption_processor=prompt_enhancer_image_caption_processor,
+            prompt_enhancer_llm_model=prompt_enhancer_llm_model,
+            prompt_enhancer_llm_tokenizer=prompt_enhancer_llm_tokenizer,
+        )
+        self.video_scale_factor, self.vae_scale_factor, _ = get_vae_size_scale_factor(
+            self.vae
+        )
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.allowed_inference_steps = allowed_inference_steps
+    def mask_text_embeddings(self, emb, mask):
+        if emb.shape[0] == 1:
+            keep_index = mask.sum().item()
+            return emb[:, :, :keep_index, :], keep_index
+        else:
+            masked_feature = emb * mask[:, None, :, None]
+            return masked_feature, emb.shape[2]
+    # Adapted from diffusers.pipelines.deepfloyd_if.pipeline_if.encode_prompt
+    def encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: str = "",
+        num_images_per_prompt: int = 1,
+        device: Optional[torch.device] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        prompt_attention_mask: Optional[torch.FloatTensor] = None,
+        negative_prompt_attention_mask: Optional[torch.FloatTensor] = None,
+        text_encoder_max_tokens: int = 256,
+        **kwargs,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds`
+                instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). For
+                This should be "".
+            do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
+                whether to use classifier free guidance or not
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                number of images that should be generated per prompt
+            device: (`torch.device`, *optional*):
+                torch device to place the resulting embeddings on
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings.
+        """
+        if "mask_feature" in kwargs:
+            deprecation_message = "The use of `mask_feature` is deprecated. It is no longer used in any computation and that doesn't affect the end results. It will be removed in a future version."
+            deprecate("mask_feature", "1.0.0", deprecation_message, standard_warn=False)
+        if device is None:
+            device = self._execution_device
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+        # See Section 3.1. of the paper.
+        max_length = (
+            text_encoder_max_tokens  # TPU supports only lengths multiple of 128
+        )
+        if prompt_embeds is None:
+            assert (
+                self.text_encoder is not None
+            ), "You should provide either prompt_embeds or self.text_encoder should not be None,"
+            text_enc_device = next(self.text_encoder.parameters()).device
+            prompt = self._text_preprocessing(prompt)
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                add_special_tokens=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(
+                prompt, padding="longest", return_tensors="pt"
+            ).input_ids
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[
+                -1
+            ] and not torch.equal(text_input_ids, untruncated_ids):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, max_length - 1 : -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {max_length} tokens: {removed_text}"
+                )
+            prompt_attention_mask = text_inputs.attention_mask
+            prompt_attention_mask = prompt_attention_mask.to(text_enc_device)
+            prompt_attention_mask = prompt_attention_mask.to(device)
+            prompt_embeds = self.text_encoder(
+                text_input_ids.to(text_enc_device), attention_mask=prompt_attention_mask
+            )
+            prompt_embeds = prompt_embeds[0]
+        if self.text_encoder is not None:
+            dtype = self.text_encoder.dtype
+        elif self.transformer is not None:
+            dtype = self.transformer.dtype
+        else:
+            dtype = None
+        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(
+            bs_embed * num_images_per_prompt, seq_len, -1
+        )
+        prompt_attention_mask = prompt_attention_mask.repeat(1, num_images_per_prompt)
+        prompt_attention_mask = prompt_attention_mask.view(
+            bs_embed * num_images_per_prompt, -1
+        )
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens = self._text_preprocessing(negative_prompt)
+            uncond_tokens = uncond_tokens * batch_size
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_attention_mask=True,
+                add_special_tokens=True,
+                return_tensors="pt",
+            )
+            negative_prompt_attention_mask = uncond_input.attention_mask
+            negative_prompt_attention_mask = negative_prompt_attention_mask.to(
+                text_enc_device
+            )
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(text_enc_device),
+                attention_mask=negative_prompt_attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+            negative_prompt_embeds = negative_prompt_embeds.to(
+                dtype=dtype, device=device
+            )
+            negative_prompt_embeds = negative_prompt_embeds.repeat(
+                1, num_images_per_prompt, 1
+            )
+            negative_prompt_embeds = negative_prompt_embeds.view(
+                batch_size * num_images_per_prompt, seq_len, -1
+            )
+            negative_prompt_attention_mask = negative_prompt_attention_mask.repeat(
+                1, num_images_per_prompt
+            )
+            negative_prompt_attention_mask = negative_prompt_attention_mask.view(
+                bs_embed * num_images_per_prompt, -1
+            )
+        else:
+            negative_prompt_embeds = None
+            negative_prompt_attention_mask = None
+        return (
+            prompt_embeds,
+            prompt_attention_mask,
+            negative_prompt_embeds,
+            negative_prompt_attention_mask,
+        )
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+        accepts_eta = "eta" in set(
+            inspect.signature(self.scheduler.step).parameters.keys()
+        )
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(
+            inspect.signature(self.scheduler.step).parameters.keys()
+        )
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        negative_prompt,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        prompt_attention_mask=None,
+        negative_prompt_attention_mask=None,
+        enhance_prompt=False,
+    ):
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(
+                f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
+            )
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (
+            not isinstance(prompt, str) and not isinstance(prompt, list)
+        ):
+            raise ValueError(
+                f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
+            )
+        if prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        if prompt_embeds is not None and prompt_attention_mask is None:
+            raise ValueError(
+                "Must provide `prompt_attention_mask` when specifying `prompt_embeds`."
+            )
+        if (
+            negative_prompt_embeds is not None
+            and negative_prompt_attention_mask is None
+        ):
+            raise ValueError(
+                "Must provide `negative_prompt_attention_mask` when specifying `negative_prompt_embeds`."
+            )
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+            if prompt_attention_mask.shape != negative_prompt_attention_mask.shape:
+                raise ValueError(
+                    "`prompt_attention_mask` and `negative_prompt_attention_mask` must have the same shape when passed directly, but"
+                    f" got: `prompt_attention_mask` {prompt_attention_mask.shape} != `negative_prompt_attention_mask`"
+                    f" {negative_prompt_attention_mask.shape}."
+                )
+        if enhance_prompt:
+            assert (
+                self.prompt_enhancer_image_caption_model is not None
+            ), "Image caption model must be initialized if enhance_prompt is True"
+            assert (
+                self.prompt_enhancer_image_caption_processor is not None
+            ), "Image caption processor must be initialized if enhance_prompt is True"
+            assert (
+                self.prompt_enhancer_llm_model is not None
+            ), "Text prompt enhancer model must be initialized if enhance_prompt is True"
+            assert (
+                self.prompt_enhancer_llm_tokenizer is not None
+            ), "Text prompt enhancer tokenizer must be initialized if enhance_prompt is True"
+    def _text_preprocessing(self, text):
+        if not isinstance(text, (tuple, list)):
+            text = [text]
+        def process(text: str):
+            text = text.strip()
+            return text
+        return [process(t) for t in text]
+    @staticmethod
+    def add_noise_to_image_conditioning_latents(
+        t: float,
+        init_latents: torch.Tensor,
+        latents: torch.Tensor,
+        noise_scale: float,
+        conditioning_mask: torch.Tensor,
+        generator,
+        eps=1e-6,
+    ):
+        """
+        Add timestep-dependent noise to the hard-conditioning latents.
+        This helps with motion continuity, especially when conditioned on a single frame.
+        """
+        noise = randn_tensor(
+            latents.shape,
+            generator=generator,
+            device=latents.device,
+            dtype=latents.dtype,
+        )
+        # Add noise only to hard-conditioning latents (conditioning_mask = 1.0)
+        need_to_noise = (conditioning_mask > 1.0 - eps).unsqueeze(-1)
+        noised_latents = init_latents + noise_scale * noise * (t**2)
+        latents = torch.where(need_to_noise, noised_latents, latents)
+        return latents
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(
+        self,
+        latents: torch.Tensor | None,
+        media_items: torch.Tensor | None,
+        timestep: float,
+        latent_shape: torch.Size | Tuple[Any, ...],
+        dtype: torch.dtype,
+        device: torch.device,
+        generator: torch.Generator | List[torch.Generator],
+        vae_per_channel_normalize: bool = True,
+    ):
+        """
+        Prepare the initial latent tensor to be denoised.
+        The latents are either pure noise or a noised version of the encoded media items.
+        Args:
+            latents (`torch.FloatTensor` or `None`):
+                The latents to use (provided by the user) or `None` to create new latents.
+            media_items (`torch.FloatTensor` or `None`):
+                An image or video to be updated using img2img or vid2vid. The media item is encoded and noised.
+            timestep (`float`):
+                The timestep to noise the encoded media_items to.
+            latent_shape (`torch.Size`):
+                The target latent shape.
+            dtype (`torch.dtype`):
+                The target dtype.
+            device (`torch.device`):
+                The target device.
+            generator (`torch.Generator` or `List[torch.Generator]`):
+                Generator(s) to be used for the noising process.
+            vae_per_channel_normalize ('bool'):
+                When encoding the media_items, whether to normalize the latents per-channel.
+        Returns:
+            `torch.FloatTensor`: The latents to be used for the denoising process. This is a tensor of shape
+            (batch_size, num_channels, height, width).
+        """
+        if isinstance(generator, list) and len(generator) != latent_shape[0]:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {latent_shape[0]}. Make sure the batch size matches the length of the generators."
+            )
+        # Initialize the latents with the given latents or encoded media item, if provided
+        assert (
+            latents is None or media_items is None
+        ), "Cannot provide both latents and media_items. Please provide only one of the two."
+        assert (
+            latents is None and media_items is None or timestep < 1.0
+        ), "Input media_item or latents are provided, but they will be replaced with noise."
+        if media_items is not None:
+            latents = vae_encode(
+                media_items.to(dtype=self.vae.dtype, device=self.vae.device),
+                self.vae,
+                vae_per_channel_normalize=vae_per_channel_normalize,
+            )
+        if latents is not None:
+            assert (
+                latents.shape == latent_shape
+            ), f"Latents have to be of shape {latent_shape} but are {latents.shape}."
+            latents = latents.to(device=device, dtype=dtype)
+        # For backward compatibility, generate in the "patchified" shape and rearrange
+        b, c, f, h, w = latent_shape
+        noise = randn_tensor(
+            (b, f * h * w, c), generator=generator, device=device, dtype=dtype
+        )
+        noise = rearrange(noise, "b (f h w) c -> b c f h w", f=f, h=h, w=w)
+        # scale the initial noise by the standard deviation required by the scheduler
+        noise = noise * self.scheduler.init_noise_sigma
+        if latents is None:
+            latents = noise
+        else:
+            # Noise the latents to the required (first) timestep
+            latents = timestep * noise + (1 - timestep) * latents
+        return latents
+    @staticmethod
+    def classify_height_width_bin(
+        height: int, width: int, ratios: dict
+    ) -> Tuple[int, int]:
+        """Returns binned height and width."""
+        ar = float(height / width)
+        closest_ratio = min(ratios.keys(), key=lambda ratio: abs(float(ratio) - ar))
+        default_hw = ratios[closest_ratio]
+        return int(default_hw[0]), int(default_hw[1])
+    @staticmethod
+    def resize_and_crop_tensor(
+        samples: torch.Tensor, new_width: int, new_height: int
+    ) -> torch.Tensor:
+        n_frames, orig_height, orig_width = samples.shape[-3:]
+        # Check if resizing is needed
+        if orig_height != new_height or orig_width != new_width:
+            ratio = max(new_height / orig_height, new_width / orig_width)
+            resized_width = int(orig_width * ratio)
+            resized_height = int(orig_height * ratio)
+            # Resize
+            samples = LTXVideoPipeline.resize_tensor(
+                samples, resized_height, resized_width
+            )
+            # Center Crop
+            start_x = (resized_width - new_width) // 2
+            end_x = start_x + new_width
+            start_y = (resized_height - new_height) // 2
+            end_y = start_y + new_height
+            samples = samples[..., start_y:end_y, start_x:end_x]
+        return samples
+    @staticmethod
+    def resize_tensor(media_items, height, width):
+        n_frames = media_items.shape[2]
+        if media_items.shape[-2:] != (height, width):
+            media_items = rearrange(media_items, "b c n h w -> (b n) c h w")
+            media_items = F.interpolate(
+                media_items,
+                size=(height, width),
+                mode="bilinear",
+                align_corners=False,
+            )
+            media_items = rearrange(media_items, "(b n) c h w -> b c n h w", n=n_frames)
+        return media_items
+    @torch.no_grad()
+    def __call__(
+        self,
+        height: int,
+        width: int,
+        num_frames: int,
+        frame_rate: float,
+        prompt: Union[str, List[str]] = None,
+        negative_prompt: str = "",
+        num_inference_steps: int = 20,
+        skip_initial_inference_steps: int = 0,
+        skip_final_inference_steps: int = 0,
+        timesteps: List[int] = None,
+        guidance_scale: Union[float, List[float]] = 4.5,
+        cfg_star_rescale: bool = False,
+        skip_layer_strategy: Optional[SkipLayerStrategy] = None,
+        skip_block_list: Optional[Union[List[List[int]], List[int]]] = None,
+        stg_scale: Union[float, List[float]] = 1.0,
+        rescaling_scale: Union[float, List[float]] = 0.7,
+        guidance_timesteps: Optional[List[int]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        prompt_attention_mask: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_attention_mask: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        conditioning_items: Optional[List[ConditioningItem]] = None,
+        decode_timestep: Union[List[float], float] = 0.0,
+        decode_noise_scale: Optional[List[float]] = None,
+        mixed_precision: bool = False,
+        offload_to_cpu: bool = False,
+        enhance_prompt: bool = False,
+        text_encoder_max_tokens: int = 256,
+        stochastic_sampling: bool = False,
+        media_items: Optional[torch.Tensor] = None,
+        tone_map_compression_ratio: float = 0.0,
+        **kwargs,
+    ) -> Union[ImagePipelineOutput, Tuple]:
+        """
+        Function invoked when calling the pipeline for generation.
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            num_inference_steps (`int`, *optional*, defaults to 100):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference. If `timesteps` is provided, this parameter is ignored.
+            skip_initial_inference_steps (`int`, *optional*, defaults to 0):
+                The number of initial timesteps to skip. After calculating the timesteps, this number of timesteps will
+                be removed from the beginning of the timesteps list. Meaning the highest-timesteps values will not run.
+            skip_final_inference_steps (`int`, *optional*, defaults to 0):
+                The number of final timesteps to skip. After calculating the timesteps, this number of timesteps will
+                be removed from the end of the timesteps list. Meaning the lowest-timesteps values will not run.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps`
+                timesteps are used. Must be in descending order.
+            guidance_scale (`float`, *optional*, defaults to 4.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            cfg_star_rescale (`bool`, *optional*, defaults to `False`):
+                If set to `True`, applies the CFG star rescale. Scales the negative prediction according to dot
+                product between positive and negative.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            height (`int`, *optional*, defaults to self.unet.config.sample_size):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size):
+                The width in pixels of the generated image.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            prompt_attention_mask (`torch.FloatTensor`, *optional*): Pre-generated attention mask for text embeddings.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. This negative prompt should be "". If not
+                provided, negative_prompt_embeds will be generated from `negative_prompt` input argument.
+            negative_prompt_attention_mask (`torch.FloatTensor`, *optional*):
+                Pre-generated attention mask for negative text embeddings.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that calls at the end of each denoising steps during the inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            use_resolution_binning (`bool` defaults to `True`):
+                If set to `True`, the requested height and width are first mapped to the closest resolutions using
+                `ASPECT_RATIO_1024_BIN`. After the produced latents are decoded into images, they are resized back to
+                the requested resolution. Useful for generating non-square images.
+            enhance_prompt (`bool`, *optional*, defaults to `False`):
+                If set to `True`, the prompt is enhanced using a LLM model.
+            text_encoder_max_tokens (`int`, *optional*, defaults to `256`):
+                The maximum number of tokens to use for the text encoder.
+            stochastic_sampling (`bool`, *optional*, defaults to `False`):
+                If set to `True`, the sampling is stochastic. If set to `False`, the sampling is deterministic.
+            media_items ('torch.Tensor', *optional*):
+                The input media item used for image-to-image / video-to-video.
+            tone_map_compression_ratio: compression ratio for tone mapping, defaults to 0.0.
+                        If set to 0.0, no tone mapping is applied. If set to 1.0 - full compression is applied.
+        Examples:
+        Returns:
+            [`~pipelines.ImagePipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
+                returned where the first element is a list with the generated images
+        """
+        if "mask_feature" in kwargs:
+            deprecation_message = "The use of `mask_feature` is deprecated. It is no longer used in any computation and that doesn't affect the end results. It will be removed in a future version."
+            deprecate("mask_feature", "1.0.0", deprecation_message, standard_warn=False)
+        is_video = kwargs.get("is_video", False)
+        self.check_inputs(
+            prompt,
+            height,
+            width,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+            prompt_attention_mask,
+            negative_prompt_attention_mask,
+        )
+        # 2. Default height and width to transformer
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+        device = self._execution_device
+        self.video_scale_factor = self.video_scale_factor if is_video else 1
+        vae_per_channel_normalize = kwargs.get("vae_per_channel_normalize", True)
+        image_cond_noise_scale = kwargs.get("image_cond_noise_scale", 0.0)
+        latent_height = height // self.vae_scale_factor
+        latent_width = width // self.vae_scale_factor
+        latent_num_frames = num_frames // self.video_scale_factor
+        if isinstance(self.vae, CausalVideoAutoencoder) and is_video:
+            latent_num_frames += 1
+        latent_shape = (
+            batch_size * num_images_per_prompt,
+            self.transformer.config.in_channels,
+            latent_num_frames,
+            latent_height,
+            latent_width,
+        )
+        # Prepare the list of denoising time-steps
+        retrieve_timesteps_kwargs = {}
+        if isinstance(self.scheduler, TimestepShifter):
+            retrieve_timesteps_kwargs["samples_shape"] = latent_shape
+        assert (
+            skip_initial_inference_steps == 0
+            or latents is not None
+            or media_items is not None
+        ), (
+            f"skip_initial_inference_steps ({skip_initial_inference_steps}) is used for image-to-image/video-to-video - "
+            "media_item or latents should be provided."
+        )
+        timesteps, num_inference_steps = retrieve_timesteps(
+            self.scheduler,
+            num_inference_steps,
+            device,
+            timesteps,
+            skip_initial_inference_steps=skip_initial_inference_steps,
+            skip_final_inference_steps=skip_final_inference_steps,
+            **retrieve_timesteps_kwargs,
+        )
+        if self.allowed_inference_steps is not None:
+            for timestep in [round(x, 4) for x in timesteps.tolist()]:
+                assert (
+                    timestep in self.allowed_inference_steps
+                ), f"Invalid inference timestep {timestep}. Allowed timesteps are {self.allowed_inference_steps}."
+        if guidance_timesteps:
+            guidance_mapping = []
+            for timestep in timesteps:
+                indices = [
+                    i for i, val in enumerate(guidance_timesteps) if val <= timestep
+                ]
+                # assert len(indices) > 0, f"No guidance timestep found for {timestep}"
+                guidance_mapping.append(
+                    indices[0] if len(indices) > 0 else (len(guidance_timesteps) - 1)
+                )
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        if not isinstance(guidance_scale, List):
+            guidance_scale = [guidance_scale] * len(timesteps)
+        else:
+            guidance_scale = [
+                guidance_scale[guidance_mapping[i]] for i in range(len(timesteps))
+            ]
+        if not isinstance(stg_scale, List):
+            stg_scale = [stg_scale] * len(timesteps)
+        else:
+            stg_scale = [stg_scale[guidance_mapping[i]] for i in range(len(timesteps))]
+        if not isinstance(rescaling_scale, List):
+            rescaling_scale = [rescaling_scale] * len(timesteps)
+        else:
+            rescaling_scale = [
+                rescaling_scale[guidance_mapping[i]] for i in range(len(timesteps))
+            ]
+        # Normalize skip_block_list to always be None or a list of lists matching timesteps
+        if skip_block_list is not None:
+            # Convert single list to list of lists if needed
+            if len(skip_block_list) == 0 or not isinstance(skip_block_list[0], list):
+                skip_block_list = [skip_block_list] * len(timesteps)
+            else:
+                new_skip_block_list = []
+                for i, timestep in enumerate(timesteps):
+                    new_skip_block_list.append(skip_block_list[guidance_mapping[i]])
+                skip_block_list = new_skip_block_list
+        if enhance_prompt:
+            self.prompt_enhancer_image_caption_model = (
+                self.prompt_enhancer_image_caption_model.to(self._execution_device)
+            )
+            self.prompt_enhancer_llm_model = self.prompt_enhancer_llm_model.to(
+                self._execution_device
+            )
+            prompt = generate_cinematic_prompt(
+                self.prompt_enhancer_image_caption_model,
+                self.prompt_enhancer_image_caption_processor,
+                self.prompt_enhancer_llm_model,
+                self.prompt_enhancer_llm_tokenizer,
+                prompt,
+                conditioning_items,
+                max_new_tokens=text_encoder_max_tokens,
+            )
+        # 3. Encode input prompt
+        if self.text_encoder is not None:
+            self.text_encoder = self.text_encoder.to(self._execution_device)
+        (
+            prompt_embeds,
+            prompt_attention_mask,
+            negative_prompt_embeds,
+            negative_prompt_attention_mask,
+        ) = self.encode_prompt(
+            prompt,
+            True,
+            negative_prompt=negative_prompt,
+            num_images_per_prompt=num_images_per_prompt,
+            device=device,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            prompt_attention_mask=prompt_attention_mask,
+            negative_prompt_attention_mask=negative_prompt_attention_mask,
+            text_encoder_max_tokens=text_encoder_max_tokens,
+        )
+        if offload_to_cpu and self.text_encoder is not None:
+            self.text_encoder = self.text_encoder.cpu()
+        self.transformer = self.transformer.to(self._execution_device)
+        prompt_embeds_batch = prompt_embeds
+        prompt_attention_mask_batch = prompt_attention_mask
+        negative_prompt_embeds = (
+            torch.zeros_like(prompt_embeds)
+            if negative_prompt_embeds is None
+            else negative_prompt_embeds
+        )
+        negative_prompt_attention_mask = (
+            torch.zeros_like(prompt_attention_mask)
+            if negative_prompt_attention_mask is None
+            else negative_prompt_attention_mask
+        )
+        prompt_embeds_batch = torch.cat(
+            [negative_prompt_embeds, prompt_embeds, prompt_embeds], dim=0
+        )
+        prompt_attention_mask_batch = torch.cat(
+            [
+                negative_prompt_attention_mask,
+                prompt_attention_mask,
+                prompt_attention_mask,
+            ],
+            dim=0,
+        )
+        # 4. Prepare the initial latents using the provided media and conditioning items
+        # Prepare the initial latents tensor, shape = (b, c, f, h, w)
+        latents = self.prepare_latents(
+            latents=latents,
+            media_items=media_items,
+            timestep=timesteps[0],
+            latent_shape=latent_shape,
+            dtype=prompt_embeds.dtype,
+            device=device,
+            generator=generator,
+            vae_per_channel_normalize=vae_per_channel_normalize,
+        )
+        # Update the latents with the conditioning items and patchify them into (b, n, c)
+        latents, pixel_coords, conditioning_mask, num_cond_latents = (
+            self.prepare_conditioning(
+                conditioning_items=conditioning_items,
+                init_latents=latents,
+                num_frames=num_frames,
+                height=height,
+                width=width,
+                vae_per_channel_normalize=vae_per_channel_normalize,
+                generator=generator,
+            )
+        )
+        init_latents = latents.clone()  # Used for image_cond_noise_update
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+        # 7. Denoising loop
+        num_warmup_steps = max(
+            len(timesteps) - num_inference_steps * self.scheduler.order, 0
+        )
+        orig_conditioning_mask = conditioning_mask
+        # Befor compiling this code please be aware:
+        # This code might generate different input shapes if some timesteps have no STG or CFG.
+        # This means that the codes might need to be compiled mutliple times.
+        # To avoid that, use the same STG and CFG values for all timesteps.
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                do_classifier_free_guidance = guidance_scale[i] > 1.0
+                do_spatio_temporal_guidance = stg_scale[i] > 0
+                do_rescaling = rescaling_scale[i] != 1.0
+                num_conds = 1
+                if do_classifier_free_guidance:
+                    num_conds += 1
+                if do_spatio_temporal_guidance:
+                    num_conds += 1
+                if do_classifier_free_guidance and do_spatio_temporal_guidance:
+                    indices = slice(batch_size * 0, batch_size * 3)
+                elif do_classifier_free_guidance:
+                    indices = slice(batch_size * 0, batch_size * 2)
+                elif do_spatio_temporal_guidance:
+                    indices = slice(batch_size * 1, batch_size * 3)
+                else:
+                    indices = slice(batch_size * 1, batch_size * 2)
+                # Prepare skip layer masks
+                skip_layer_mask: Optional[torch.Tensor] = None
+                if do_spatio_temporal_guidance:
+                    if skip_block_list is not None:
+                        skip_layer_mask = self.transformer.create_skip_layer_mask(
+                            batch_size, num_conds, num_conds - 1, skip_block_list[i]
+                        )
+                batch_pixel_coords = torch.cat([pixel_coords] * num_conds)
+                conditioning_mask = orig_conditioning_mask
+                if conditioning_mask is not None and is_video:
+                    assert num_images_per_prompt == 1
+                    conditioning_mask = torch.cat([conditioning_mask] * num_conds)
+                fractional_coords = batch_pixel_coords.to(torch.float32)
+                fractional_coords[:, 0] = fractional_coords[:, 0] * (1.0 / frame_rate)
+                if conditioning_mask is not None and image_cond_noise_scale > 0.0:
+                    latents = self.add_noise_to_image_conditioning_latents(
+                        t,
+                        init_latents,
+                        latents,
+                        image_cond_noise_scale,
+                        orig_conditioning_mask,
+                        generator,
+                    )
+                latent_model_input = (
+                    torch.cat([latents] * num_conds) if num_conds > 1 else latents
+                )
+                latent_model_input = self.scheduler.scale_model_input(
+                    latent_model_input, t
+                )
+                current_timestep = t
+                if not torch.is_tensor(current_timestep):
+                    # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
+                    # This would be a good case for the `match` statement (Python 3.10+)
+                    is_mps = latent_model_input.device.type == "mps"
+                    if isinstance(current_timestep, float):
+                        dtype = torch.float32 if is_mps else torch.float64
+                    else:
+                        dtype = torch.int32 if is_mps else torch.int64
+                    current_timestep = torch.tensor(
+                        [current_timestep],
+                        dtype=dtype,
+                        device=latent_model_input.device,
+                    )
+                elif len(current_timestep.shape) == 0:
+                    current_timestep = current_timestep[None].to(
+                        latent_model_input.device
+                    )
+                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+                current_timestep = current_timestep.expand(
+                    latent_model_input.shape[0]
+                ).unsqueeze(-1)
+                if conditioning_mask is not None:
+                    # Conditioning latents have an initial timestep and noising level of (1.0 - conditioning_mask)
+                    # and will start to be denoised when the current timestep is lower than their conditioning timestep.
+                    current_timestep = torch.min(
+                        current_timestep, 1.0 - conditioning_mask
+                    )
+                # Choose the appropriate context manager based on `mixed_precision`
+                if mixed_precision:
+                    context_manager = torch.autocast(device.type, dtype=torch.bfloat16)
+                else:
+                    context_manager = nullcontext()  # Dummy context manager
+                # predict noise model_output
+                with context_manager:
+                    noise_pred = self.transformer(
+                        latent_model_input.to(self.transformer.dtype),
+                        indices_grid=fractional_coords,
+                        encoder_hidden_states=prompt_embeds_batch[indices].to(
+                            self.transformer.dtype
+                        ),
+                        encoder_attention_mask=prompt_attention_mask_batch[indices],
+                        timestep=current_timestep,
+                        skip_layer_mask=skip_layer_mask,
+                        skip_layer_strategy=skip_layer_strategy,
+                        return_dict=False,
+                    )[0]
+                # perform guidance
+                if do_spatio_temporal_guidance:
+                    noise_pred_text, noise_pred_text_perturb = noise_pred.chunk(
+                        num_conds
+                    )[-2:]
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(num_conds)[:2]
+                    if cfg_star_rescale:
+                        # Rescales the unconditional noise prediction using the projection of the conditional prediction onto it:
+                        # α = (⟨ε_text, ε_uncond⟩ / ||ε_uncond||²), then ε_uncond ← α * ε_uncond
+                        # where ε_text is the conditional noise prediction and ε_uncond is the unconditional one.
+                        positive_flat = noise_pred_text.view(batch_size, -1)
+                        negative_flat = noise_pred_uncond.view(batch_size, -1)
+                        dot_product = torch.sum(
+                            positive_flat * negative_flat, dim=1, keepdim=True
+                        )
+                        squared_norm = (
+                            torch.sum(negative_flat**2, dim=1, keepdim=True) + 1e-8
+                        )
+                        alpha = dot_product / squared_norm
+                        noise_pred_uncond = alpha * noise_pred_uncond
+                    noise_pred = noise_pred_uncond + guidance_scale[i] * (
+                        noise_pred_text - noise_pred_uncond
+                    )
+                elif do_spatio_temporal_guidance:
+                    noise_pred = noise_pred_text
+                if do_spatio_temporal_guidance:
+                    noise_pred = noise_pred + stg_scale[i] * (
+                        noise_pred_text - noise_pred_text_perturb
+                    )
+                    if do_rescaling and stg_scale[i] > 0.0:
+                        noise_pred_text_std = noise_pred_text.view(batch_size, -1).std(
+                            dim=1, keepdim=True
+                        )
+                        noise_pred_std = noise_pred.view(batch_size, -1).std(
+                            dim=1, keepdim=True
+                        )
+                        factor = noise_pred_text_std / noise_pred_std
+                        factor = rescaling_scale[i] * factor + (1 - rescaling_scale[i])
+                        noise_pred = noise_pred * factor.view(batch_size, 1, 1)
+                current_timestep = current_timestep[:1]
+                # learned sigma
+                if (
+                    self.transformer.config.out_channels // 2
+                    == self.transformer.config.in_channels
+                ):
+                    noise_pred = noise_pred.chunk(2, dim=1)[0]
+                # compute previous image: x_t -> x_t-1
+                latents = self.denoising_step(
+                    latents,
+                    noise_pred,
+                    current_timestep,
+                    orig_conditioning_mask,
+                    t,
+                    extra_step_kwargs,
+                    stochastic_sampling=stochastic_sampling,
+                )
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or (
+                    (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
+                ):
+                    progress_bar.update()
+                if callback_on_step_end is not None:
+                    callback_on_step_end(self, i, t, {})
+        if offload_to_cpu:
+            self.transformer = self.transformer.cpu()
+            if self._execution_device == "cuda":
+                torch.cuda.empty_cache()
+        # Remove the added conditioning latents
+        latents = latents[:, num_cond_latents:]
+        latents = self.patchifier.unpatchify(
+            latents=latents,
+            output_height=latent_height,
+            output_width=latent_width,
+            out_channels=self.transformer.in_channels
+            // math.prod(self.patchifier.patch_size),
+        )
+        if output_type != "latent":
+            if self.vae.decoder.timestep_conditioning:
+                noise = torch.randn_like(latents)
+                if not isinstance(decode_timestep, list):
+                    decode_timestep = [decode_timestep] * latents.shape[0]
+                if decode_noise_scale is None:
+                    decode_noise_scale = decode_timestep
+                elif not isinstance(decode_noise_scale, list):
+                    decode_noise_scale = [decode_noise_scale] * latents.shape[0]
+                decode_timestep = torch.tensor(decode_timestep).to(latents.device)
+                decode_noise_scale = torch.tensor(decode_noise_scale).to(
+                    latents.device
+                )[:, None, None, None, None]
+                latents = (
+                    latents * (1 - decode_noise_scale) + noise * decode_noise_scale
+                )
+            else:
+                decode_timestep = None
+            latents = self.tone_map_latents(latents, tone_map_compression_ratio)
+            image = vae_decode(
+                latents,
+                self.vae,
+                is_video,
+                vae_per_channel_normalize=kwargs["vae_per_channel_normalize"],
+                timestep=decode_timestep,
+            )
+            image = self.image_processor.postprocess(image, output_type=output_type)
+        else:
+            image = latents
+        # Offload all models
+        self.maybe_free_model_hooks()
+        if not return_dict:
+            return (image,)
+        return ImagePipelineOutput(images=image)
+    def denoising_step(
+        self,
+        latents: torch.Tensor,
+        noise_pred: torch.Tensor,
+        current_timestep: torch.Tensor,
+        conditioning_mask: torch.Tensor,
+        t: float,
+        extra_step_kwargs,
+        t_eps=1e-6,
+        stochastic_sampling=False,
+    ):
+        """
+        Perform the denoising step for the required tokens, based on the current timestep and
+        conditioning mask:
+        Conditioning latents have an initial timestep and noising level of (1.0 - conditioning_mask)
+        and will start to be denoised when the current timestep is equal or lower than their
+        conditioning timestep.
+        (hard-conditioning latents with conditioning_mask = 1.0 are never denoised)
+        """
+        # Denoise the latents using the scheduler
+        denoised_latents = self.scheduler.step(
+            noise_pred,
+            t if current_timestep is None else current_timestep,
+            latents,
+            **extra_step_kwargs,
+            return_dict=False,
+            stochastic_sampling=stochastic_sampling,
+        )[0]
+        if conditioning_mask is None:
+            return denoised_latents
+        tokens_to_denoise_mask = (t - t_eps < (1.0 - conditioning_mask)).unsqueeze(-1)
+        return torch.where(tokens_to_denoise_mask, denoised_latents, latents)
+    def prepare_conditioning(
+        self,
+        conditioning_items: Optional[List[ConditioningItem]],
+        init_latents: torch.Tensor,
+        num_frames: int,
+        height: int,
+        width: int,
+        vae_per_channel_normalize: bool = False,
+        generator=None,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int]:
+        """
+        Prepare conditioning tokens based on the provided conditioning items.
+        This method encodes provided conditioning items (video frames or single frames) into latents
+        and integrates them with the initial latent tensor. It also calculates corresponding pixel
+        coordinates, a mask indicating the influence of conditioning latents, and the total number of
+        conditioning latents.
+        Args:
+            conditioning_items (Optional[List[ConditioningItem]]): A list of ConditioningItem objects.
+            init_latents (torch.Tensor): The initial latent tensor of shape (b, c, f_l, h_l, w_l), where
+                `f_l` is the number of latent frames, and `h_l` and `w_l` are latent spatial dimensions.
+            num_frames, height, width: The dimensions of the generated video.
+            vae_per_channel_normalize (bool, optional): Whether to normalize channels during VAE encoding.
+                Defaults to `False`.
+            generator: The random generator
+        Returns:
+            Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int]:
+                - `init_latents` (torch.Tensor): The updated latent tensor including conditioning latents,
+                  patchified into (b, n, c) shape.
+                - `init_pixel_coords` (torch.Tensor): The pixel coordinates corresponding to the updated
+                  latent tensor.
+                - `conditioning_mask` (torch.Tensor): A mask indicating the conditioning-strength of each
+                  latent token.
+                - `num_cond_latents` (int): The total number of latent tokens added from conditioning items.
+        Raises:
+            AssertionError: If input shapes, dimensions, or conditions for applying conditioning are invalid.
+        """
+        assert isinstance(self.vae, CausalVideoAutoencoder)
+        if conditioning_items:
+            batch_size, _, num_latent_frames = init_latents.shape[:3]
+            init_conditioning_mask = torch.zeros(
+                init_latents[:, 0, :, :, :].shape,
+                dtype=torch.float32,
+                device=init_latents.device,
+            )
+            extra_conditioning_latents = []
+            extra_conditioning_pixel_coords = []
+            extra_conditioning_mask = []
+            extra_conditioning_num_latents = 0  # Number of extra conditioning latents added (should be removed before decoding)
+            # Process each conditioning item
+            for conditioning_item in conditioning_items:
+                conditioning_item = self._resize_conditioning_item(
+                    conditioning_item, height, width
+                )
+                media_item = conditioning_item.media_item
+                media_frame_number = conditioning_item.media_frame_number
+                strength = conditioning_item.conditioning_strength
+                assert media_item.ndim == 5  # (b, c, f, h, w)
+                b, c, n_frames, h, w = media_item.shape
+                assert (
+                    height == h and width == w
+                ) or media_frame_number == 0, f"Dimensions do not match: {height}x{width} != {h}x{w} - allowed only when media_frame_number == 0"
+                assert n_frames % 8 == 1
+                assert (
+                    media_frame_number >= 0
+                    and media_frame_number + n_frames <= num_frames
+                )
+                # Encode the provided conditioning media item
+                media_item_latents = vae_encode(
+                    media_item.to(dtype=self.vae.dtype, device=self.vae.device),
+                    self.vae,
+                    vae_per_channel_normalize=vae_per_channel_normalize,
+                ).to(dtype=init_latents.dtype)
+                # Handle the different conditioning cases
+                if media_frame_number == 0:
+                    # Get the target spatial position of the latent conditioning item
+                    media_item_latents, l_x, l_y = self._get_latent_spatial_position(
+                        media_item_latents,
+                        conditioning_item,
+                        height,
+                        width,
+                        strip_latent_border=True,
+                    )
+                    b, c_l, f_l, h_l, w_l = media_item_latents.shape
+                    # First frame or sequence - just update the initial noise latents and the mask
+                    init_latents[:, :, :f_l, l_y : l_y + h_l, l_x : l_x + w_l] = (
+                        torch.lerp(
+                            init_latents[:, :, :f_l, l_y : l_y + h_l, l_x : l_x + w_l],
+                            media_item_latents,
+                            strength,
+                        )
+                    )
+                    init_conditioning_mask[
+                        :, :f_l, l_y : l_y + h_l, l_x : l_x + w_l
+                    ] = strength
+                else:
+                    # Non-first frame or sequence
+                    if n_frames > 1:
+                        # Handle non-first sequence.
+                        # Encoded latents are either fully consumed, or the prefix is handled separately below.
+                        (
+                            init_latents,
+                            init_conditioning_mask,
+                            media_item_latents,
+                        ) = self._handle_non_first_conditioning_sequence(
+                            init_latents,
+                            init_conditioning_mask,
+                            media_item_latents,
+                            media_frame_number,
+                            strength,
+                        )
+                    # Single frame or sequence-prefix latents
+                    if media_item_latents is not None:
+                        noise = randn_tensor(
+                            media_item_latents.shape,
+                            generator=generator,
+                            device=media_item_latents.device,
+                            dtype=media_item_latents.dtype,
+                        )
+                        media_item_latents = torch.lerp(
+                            noise, media_item_latents, strength
+                        )
+                        # Patchify the extra conditioning latents and calculate their pixel coordinates
+                        media_item_latents, latent_coords = self.patchifier.patchify(
+                            latents=media_item_latents
+                        )
+                        pixel_coords = latent_to_pixel_coords(
+                            latent_coords,
+                            self.vae,
+                            causal_fix=self.transformer.config.causal_temporal_positioning,
+                        )
+                        # Update the frame numbers to match the target frame number
+                        pixel_coords[:, 0] += media_frame_number
+                        extra_conditioning_num_latents += media_item_latents.shape[1]
+                        conditioning_mask = torch.full(
+                            media_item_latents.shape[:2],
+                            strength,
+                            dtype=torch.float32,
+                            device=init_latents.device,
+                        )
+                        extra_conditioning_latents.append(media_item_latents)
+                        extra_conditioning_pixel_coords.append(pixel_coords)
+                        extra_conditioning_mask.append(conditioning_mask)
+        # Patchify the updated latents and calculate their pixel coordinates
+        init_latents, init_latent_coords = self.patchifier.patchify(
+            latents=init_latents
+        )
+        init_pixel_coords = latent_to_pixel_coords(
+            init_latent_coords,
+            self.vae,
+            causal_fix=self.transformer.config.causal_temporal_positioning,
+        )
+        if not conditioning_items:
+            return init_latents, init_pixel_coords, None, 0
+        init_conditioning_mask, _ = self.patchifier.patchify(
+            latents=init_conditioning_mask.unsqueeze(1)
+        )
+        init_conditioning_mask = init_conditioning_mask.squeeze(-1)
+        if extra_conditioning_latents:
+            # Stack the extra conditioning latents, pixel coordinates and mask
+            init_latents = torch.cat([*extra_conditioning_latents, init_latents], dim=1)
+            init_pixel_coords = torch.cat(
+                [*extra_conditioning_pixel_coords, init_pixel_coords], dim=2
+            )
+            init_conditioning_mask = torch.cat(
+                [*extra_conditioning_mask, init_conditioning_mask], dim=1
+            )
+            if self.transformer.use_tpu_flash_attention:
+                # When flash attention is used, keep the original number of tokens by removing
+                #   tokens from the end.
+                init_latents = init_latents[:, :-extra_conditioning_num_latents]
+                init_pixel_coords = init_pixel_coords[
+                    :, :, :-extra_conditioning_num_latents
+                ]
+                init_conditioning_mask = init_conditioning_mask[
+                    :, :-extra_conditioning_num_latents
+                ]
+        return (
+            init_latents,
+            init_pixel_coords,
+            init_conditioning_mask,
+            extra_conditioning_num_latents,
+        )
+    @staticmethod
+    def _resize_conditioning_item(
+        conditioning_item: ConditioningItem,
+        height: int,
+        width: int,
+    ):
+        if conditioning_item.media_x or conditioning_item.media_y:
+            raise ValueError(
+                "Provide media_item in the target size for spatial conditioning."
+            )
+        new_conditioning_item = copy.copy(conditioning_item)
+        new_conditioning_item.media_item = LTXVideoPipeline.resize_tensor(
+            conditioning_item.media_item, height, width
+        )
+        return new_conditioning_item
+    def _get_latent_spatial_position(
+        self,
+        latents: torch.Tensor,
+        conditioning_item: ConditioningItem,
+        height: int,
+        width: int,
+        strip_latent_border,
+    ):
+        """
+        Get the spatial position of the conditioning item in the latent space.
+        If requested, strip the conditioning latent borders that do not align with target borders.
+        (border latents look different then other latents and might confuse the model)
+        """
+        scale = self.vae_scale_factor
+        h, w = conditioning_item.media_item.shape[-2:]
+        assert (
+            h <= height and w <= width
+        ), f"Conditioning item size {h}x{w} is larger than target size {height}x{width}"
+        assert h % scale == 0 and w % scale == 0
+        # Compute the start and end spatial positions of the media item
+        x_start, y_start = conditioning_item.media_x, conditioning_item.media_y
+        x_start = (width - w) // 2 if x_start is None else x_start
+        y_start = (height - h) // 2 if y_start is None else y_start
+        x_end, y_end = x_start + w, y_start + h
+        assert (
+            x_end <= width and y_end <= height
+        ), f"Conditioning item {x_start}:{x_end}x{y_start}:{y_end} is out of bounds for target size {width}x{height}"
+        if strip_latent_border:
+            # Strip one latent from left/right and/or top/bottom, update x, y accordingly
+            if x_start > 0:
+                x_start += scale
+                latents = latents[:, :, :, :, 1:]
+            if y_start > 0:
+                y_start += scale
+                latents = latents[:, :, :, 1:, :]
+            if x_end < width:
+                latents = latents[:, :, :, :, :-1]
+            if y_end < height:
+                latents = latents[:, :, :, :-1, :]
+        return latents, x_start // scale, y_start // scale
+    @staticmethod
+    def _handle_non_first_conditioning_sequence(
+        init_latents: torch.Tensor,
+        init_conditioning_mask: torch.Tensor,
+        latents: torch.Tensor,
+        media_frame_number: int,
+        strength: float,
+        num_prefix_latent_frames: int = 2,
+        prefix_latents_mode: str = "concat",
+        prefix_soft_conditioning_strength: float = 0.15,
+    ):
+        """
+        Special handling for a conditioning sequence that does not start on the first frame.
+        The special handling is required to allow a short encoded video to be used as middle
+        (or last) sequence in a longer video.
+        Args:
+            init_latents (torch.Tensor): The initial noise latents to be updated.
+            init_conditioning_mask (torch.Tensor): The initial conditioning mask to be updated.
+            latents (torch.Tensor): The encoded conditioning item.
+            media_frame_number (int): The target frame number of the first frame in the conditioning sequence.
+            strength (float): The conditioning strength for the conditioning latents.
+            num_prefix_latent_frames (int, optional): The length of the sequence prefix, to be handled
+                separately. Defaults to 2.
+            prefix_latents_mode (str, optional): Special treatment for prefix (boundary) latents.
+                - "drop": Drop the prefix latents.
+                - "soft": Use the prefix latents, but with soft-conditioning
+                - "concat": Add the prefix latents as extra tokens (like single frames)
+            prefix_soft_conditioning_strength (float, optional): The strength of the soft-conditioning for
+                the prefix latents, relevant if `prefix_latents_mode` is "soft". Defaults to 0.1.
+        """
+        f_l = latents.shape[2]
+        f_l_p = num_prefix_latent_frames
+        assert f_l >= f_l_p
+        assert media_frame_number % 8 == 0
+        if f_l > f_l_p:
+            # Insert the conditioning latents **excluding the prefix** into the sequence
+            f_l_start = media_frame_number // 8 + f_l_p
+            f_l_end = f_l_start + f_l - f_l_p
+            init_latents[:, :, f_l_start:f_l_end] = torch.lerp(
+                init_latents[:, :, f_l_start:f_l_end],
+                latents[:, :, f_l_p:],
+                strength,
+            )
+            # Mark these latent frames as conditioning latents
+            init_conditioning_mask[:, f_l_start:f_l_end] = strength
+        # Handle the prefix-latents
+        if prefix_latents_mode == "soft":
+            if f_l_p > 1:
+                # Drop the first (single-frame) latent and soft-condition the remaining prefix
+                f_l_start = media_frame_number // 8 + 1
+                f_l_end = f_l_start + f_l_p - 1
+                strength = min(prefix_soft_conditioning_strength, strength)
+                init_latents[:, :, f_l_start:f_l_end] = torch.lerp(
+                    init_latents[:, :, f_l_start:f_l_end],
+                    latents[:, :, 1:f_l_p],
+                    strength,
+                )
+                # Mark these latent frames as conditioning latents
+                init_conditioning_mask[:, f_l_start:f_l_end] = strength
+            latents = None  # No more latents to handle
+        elif prefix_latents_mode == "drop":
+            # Drop the prefix latents
+            latents = None
+        elif prefix_latents_mode == "concat":
+            # Pass-on the prefix latents to be handled as extra conditioning frames
+            latents = latents[:, :, :f_l_p]
+        else:
+            raise ValueError(f"Invalid prefix_latents_mode: {prefix_latents_mode}")
+        return (
+            init_latents,
+            init_conditioning_mask,
+            latents,
+        )
+    def trim_conditioning_sequence(
+        self, start_frame: int, sequence_num_frames: int, target_num_frames: int
+    ):
+        """
+        Trim a conditioning sequence to the allowed number of frames.
+        Args:
+            start_frame (int): The target frame number of the first frame in the sequence.
+            sequence_num_frames (int): The number of frames in the sequence.
+            target_num_frames (int): The target number of frames in the generated video.
+        Returns:
+            int: updated sequence length
+        """
+        scale_factor = self.video_scale_factor
+        num_frames = min(sequence_num_frames, target_num_frames - start_frame)
+        # Trim down to a multiple of temporal_scale_factor frames plus 1
+        num_frames = (num_frames - 1) // scale_factor * scale_factor + 1
+        return num_frames
+    @staticmethod
+    def tone_map_latents(
+        latents: torch.Tensor,
+        compression: float,
+    ) -> torch.Tensor:
+        """
+        Applies a non-linear tone-mapping function to latent values to reduce their dynamic range
+        in a perceptually smooth way using a sigmoid-based compression.
+        This is useful for regularizing high-variance latents or for conditioning outputs
+        during generation, especially when controlling dynamic behavior with a `compression` factor.
+        Parameters:
+        ----------
+        latents : torch.Tensor
+            Input latent tensor with arbitrary shape. Expected to be roughly in [-1, 1] or [0, 1] range.
+        compression : float
+            Compression strength in the range [0, 1].
+            - 0.0: No tone-mapping (identity transform)
+            - 1.0: Full compression effect
+        Returns:
+        -------
+        torch.Tensor
+            The tone-mapped latent tensor of the same shape as input.
+        """
+        if not (0 <= compression <= 1):
+            raise ValueError("Compression must be in the range [0, 1]")
+        # Remap [0-1] to [0-0.75] and apply sigmoid compression in one shot
+        scale_factor = compression * 0.75
+        abs_latents = torch.abs(latents)
+        # Sigmoid compression: sigmoid shifts large values toward 0.2, small values stay ~1.0
+        # When scale_factor=0, sigmoid term vanishes, when scale_factor=0.75, full effect
+        sigmoid_term = torch.sigmoid(4.0 * scale_factor * (abs_latents - 1.0))
+        scales = 1.0 - 0.8 * scale_factor * sigmoid_term
+        filtered = latents * scales
+        return filtered
+def adain_filter_latent(
+    latents: torch.Tensor, reference_latents: torch.Tensor, factor=1.0
+):
+    """
+    Applies Adaptive Instance Normalization (AdaIN) to a latent tensor based on
+    statistics from a reference latent tensor.
+    Args:
+        latent (torch.Tensor): Input latents to normalize
+        reference_latent (torch.Tensor): The reference latents providing style statistics.
+        factor (float): Blending factor between original and transformed latent.
+                       Range: -10.0 to 10.0, Default: 1.0
+    Returns:
+        torch.Tensor: The transformed latent tensor
+    """
+    result = latents.clone()
+    for i in range(latents.size(0)):
+        for c in range(latents.size(1)):
+            r_sd, r_mean = torch.std_mean(
+                reference_latents[i, c], dim=None
+            )  # index by original dim order
+            i_sd, i_mean = torch.std_mean(result[i, c], dim=None)
+            result[i, c] = ((result[i, c] - i_mean) / i_sd) * r_sd + r_mean
+    result = torch.lerp(latents, result, factor)
+    return result
+class LTXMultiScalePipeline:
+    def _upsample_latents(
+        self, latest_upsampler: LatentUpsampler, latents: torch.Tensor
+    ):
+        assert latents.device == latest_upsampler.device
+        latents = un_normalize_latents(
+            latents, self.vae, vae_per_channel_normalize=True
+        )
+        upsampled_latents = latest_upsampler(latents)
+        upsampled_latents = normalize_latents(
+            upsampled_latents, self.vae, vae_per_channel_normalize=True
+        )
+        return upsampled_latents
+    def __init__(
+        self, video_pipeline: LTXVideoPipeline, latent_upsampler: LatentUpsampler
+    ):
+        self.video_pipeline = video_pipeline
+        self.vae = video_pipeline.vae
+        self.latent_upsampler = latent_upsampler
+    def __call__(
+        self,
+        downscale_factor: float,
+        first_pass: dict,
+        second_pass: dict,
+        *args: Any,
+        **kwargs: Any,
+    ) -> Any:
+        original_kwargs = kwargs.copy()
+        original_output_type = kwargs["output_type"]
+        original_width = kwargs["width"]
+        original_height = kwargs["height"]
+        x_width = int(kwargs["width"] * downscale_factor)
+        downscaled_width = x_width - (x_width % self.video_pipeline.vae_scale_factor)
+        x_height = int(kwargs["height"] * downscale_factor)
+        downscaled_height = x_height - (x_height % self.video_pipeline.vae_scale_factor)
+        kwargs["output_type"] = "latent"
+        kwargs["width"] = downscaled_width
+        kwargs["height"] = downscaled_height
+        kwargs.update(**first_pass)
+        result = self.video_pipeline(*args, **kwargs)
+        latents = result.images
+        upsampled_latents = self._upsample_latents(self.latent_upsampler, latents)
+        upsampled_latents = adain_filter_latent(
+            latents=upsampled_latents, reference_latents=latents
+        )
+        kwargs = original_kwargs
+        kwargs["latents"] = upsampled_latents
+        kwargs["output_type"] = original_output_type
+        kwargs["width"] = downscaled_width * 2
+        kwargs["height"] = downscaled_height * 2
+        kwargs.update(**second_pass)
+        result = self.video_pipeline(*args, **kwargs)
+        if original_output_type != "latent":
+            num_frames = result.images.shape[2]
+            videos = rearrange(result.images, "b c f h w -> (b f) c h w")
+            videos = F.interpolate(
+                videos,
+                size=(original_height, original_width),
+                mode="bilinear",
+                align_corners=False,
+            )
+            videos = rearrange(videos, "(b f) c h w -> b c f h w", f=num_frames)
+            result.images = videos
+        return result

requirements.txt CHANGED Viewed

@@ -12,7 +12,7 @@ imageio-ffmpeg
 einops
 timm
 av
-flash-attn-3@https://huggingface.co/alexnasa/flash-attn-3/resolve/main/128/flash_attn_3-3.0.0b1-cp39-abi3-linux_x86_64.whl
 git+https://github.com/huggingface/diffusers.git@main

 einops
 timm
 av
+#flash-attn-3@https://huggingface.co/alexnasa/flash-attn-3/resolve/main/128/flash_attn_3-3.0.0b1-cp39-abi3-linux_x86_64.whl
 git+https://github.com/huggingface/diffusers.git@main

setup.py CHANGED Viewed

@@ -2,179 +2,173 @@
 #
 # Copyright (C) August 4, 2025  Carlos Rodrigues dos Santos
 #
-# Versão 2.3.0 (Setup Robusto e Idempotente)
-# - Verifica a existência de repositórios e arquivos de modelo antes de baixar.
-# - Pula downloads se os artefatos já existirem, sem gerar erros.
-# - Unifica o download de todas as dependências (Git, LTX Models, SeedVR Models).
 import os
 import subprocess
 import sys
 from pathlib import Path
 import yaml
-from huggingface_hub import hf_hub_download
-# --- Configuração Geral ---
 DEPS_DIR = Path("/data")
-# --- Configuração Específica LTX-Video ---
 LTX_VIDEO_REPO_DIR = DEPS_DIR / "LTX-Video"
-# --- Configuração Específica SeedVR ---
-SEEDVR_MODELS_DIR = DEPS_DIR / "SeedVR"
-# --- Repositórios para Clonar ---
 REPOS_TO_CLONE = {
     "LTX-Video": "https://huggingface.co/spaces/Lightricks/ltx-video-distilled",
     "SeedVR": "https://github.com/numz/ComfyUI-SeedVR2_VideoUpscaler",
-    "MMAudio": "https://github.com/hkchengrex/MMAudio.git"
 }
 def run_command(command, cwd=None):
-    """Executa um comando no terminal e lida com erros."""
     print(f"Executando: {' '.join(command)}")
     try:
         subprocess.run(
-            command,
-            check=True,
-            cwd=cwd,
-            stdin=subprocess.DEVNULL,
         )
     except subprocess.CalledProcessError as e:
-        print(f"ERRO: O comando falhou com o código de saída {e.returncode}\nStderr: {e.stderr}")
         sys.exit(1)
     except FileNotFoundError:
-        print(f"ERRO: O comando '{command[0]}' não foi encontrado. Certifique-se de que o git está instalado e no seu PATH.")
         sys.exit(1)
-# --- Funções de Download (LTX-Video) ---
 def _load_ltx_config():
     """Carrega o arquivo de configuração YAML do LTX-Video."""
     print("--- Carregando Configuração do LTX-Video ---")
-    base = LTX_VIDEO_REPO_DIR / "configs"
-    candidates = [
-        base / "ltxv-13b-0.9.8-dev-fp8.yaml",
-        base / "ltxv-13b-0.9.8-distilled-fp8.yaml",
-        base / "ltxv-13b-0.9.8-distilled.yaml",
-    ]
-    for cfg_path in candidates:
-        if cfg_path.exists():
-            print(f"Configuração encontrada: {cfg_path}")
-            with open(cfg_path, "r") as file:
-                return yaml.safe_load(file)
-    fallback_path = base / "ltxv-13b-0.9.8-distilled-fp8.yaml"
-    print(f"AVISO: Nenhuma configuração preferencial encontrada. Usando fallback: {fallback_path}")
-    if not fallback_path.exists():
-        print(f"ERRO: Arquivo de configuração fallback '{fallback_path}' não encontrado.")
         return None
-    with open(fallback_path, "r") as file:
         return yaml.safe_load(file)
-def _download_ltx_models(config):
-    """Baixa os modelos principais do LTX-Video, pulando os que já existem."""
-    print("\n--- Verificando Modelos do LTX-Video ---")
-    LTX_REPO = "Lightricks/LTX-Video"
-    if "checkpoint_path" not in config or "spatial_upscaler_model_path" not in config:
-        print("ERRO: Chaves de modelo não encontradas no arquivo de configuração do LTX.")
-        sys.exit(1)
-    models_to_download = {
-        config["checkpoint_path"]: "checkpoint principal",
-        config["spatial_upscaler_model_path"]: "upscaler espacial"
-    }
-    # O hf_hub_download já verifica o cache, mas vamos verificar o diretório final para clareza
-    # e para garantir que a lógica seja explícita.
-    for filename, description in models_to_download.items():
-        # A biblioteca huggingface_hub gerencia o local exato, então confiamos nela.
-        # A verificação aqui é para garantir que o download seja tentado.
-        print(f"Garantindo a existência do {description}: {filename}...")
-        try:
-            hf_hub_download(
-                repo_id=LTX_REPO, filename=filename,
-                local_dir=os.getenv("HF_HOME"), cache_dir=os.getenv("HF_HOME_CACHE"), token=os.getenv("HF_TOKEN")
-            )
-            print(f"{description.capitalize()} está disponível.")
-        except Exception as e:
-            print(f"ERRO ao baixar o {description}: {e}")
-            sys.exit(1)
-def _download_seedvr_models():
-    """Baixa os modelos do SeedVR, pulando os que já existem."""
-    print(f"\n--- Verificando Checkpoints do SeedVR em {SEEDVR_MODELS_DIR} ---")
-    SEEDVR_MODELS_DIR.mkdir(exist_ok=True)
-    model_files = {
-        "seedvr2_ema_7b_fp16.safetensors": "MonsterMMORPG/SeedVR2_SECourses",
-        "seedvr2_ema_7b_sharp_fp16.safetensors": "MonsterMMORPG/SeedVR2_SECourses",
-        "seedvr2_ema_3b_fp16.safetensors": "MonsterMMORPG/SeedVR2_SECourses",
-        "ema_vae_fp16.safetensors": "MonsterMMORPG/SeedVR2_SECourses",
-        "pos_emb.pt": "ByteDance-Seed/SeedVR2-3B",
-        "neg_emb.pt": "ByteDance-Seed/SeedVR2-3B"
-    }
-    for filename, repo_id in model_files.items():
-        local_path = SEEDVR_MODELS_DIR / filename
-        if not local_path.is_file(): # Verifica se é um arquivo
-            print(f"Baixando {filename} de {repo_id}...")
-            try:
                 hf_hub_download(
-                    repo_id=repo_id,
-                    filename=filename,
-                    local_dir=str(SEEDVR_MODELS_DIR),
-                    cache_dir=os.getenv("HF_HOME_CACHE"),
                     token=os.getenv("HF_TOKEN"),
                 )
-                print(f"'{filename}' baixado com sucesso.")
-            except Exception as e:
-                print(f"ERRO ao baixar o modelo SeedVR '{filename}': {e}")
-                sys.exit(1)
-        else:
-            print(f"Arquivo '{filename}' já existe. Pulando.")
-    print("Checkpoints do SeedVR estão no local correto.")
-# --- Função Principal ---
 def main():
-    print("--- Iniciando Setup do Ambiente ADUC-SDR (Versão Robusta) ---")
     DEPS_DIR.mkdir(exist_ok=True)
     # --- ETAPA 1: Clonar Repositórios ---
-    print("\n--- ETAPA 1: Clonando Repositórios Git ---")
     for repo_name, repo_url in REPOS_TO_CLONE.items():
         repo_path = DEPS_DIR / repo_name
-        if repo_path.is_dir(): # Verifica se é um diretório
-            print(f"Repositório '{repo_name}' já existe. Pulando.")
         else:
             print(f"Clonando '{repo_name}' de {repo_url}...")
             run_command(["git", "clone", "--depth", "1", repo_url, str(repo_path)])
-            print(f"'{repo_name}' clonado com sucesso.")
-    # --- ETAPA 2: Baixar Modelos do LTX-Video ---
-    print("\n--- ETAPA 2: Preparando Modelos LTX-Video ---")
-    if not LTX_VIDEO_REPO_DIR.is_dir():
-        print(f"ERRO: Diretório '{LTX_VIDEO_REPO_DIR}' não encontrado. Execute a clonagem primeiro.")
-        sys.exit(1)
     ltx_config = _load_ltx_config()
-    if ltx_config:
-        _download_ltx_models(ltx_config)
-    else:
         print("ERRO: Não foi possível carregar a configuração do LTX-Video. Abortando.")
         sys.exit(1)
-    # --- ETAPA 3: Baixar Modelos do SeedVR ---
-    print("\n--- ETAPA 3: Preparando Modelos SeedVR ---")
-    _download_seedvr_models()
-    print("\n\n--- Setup do Ambiente Concluído com Sucesso! ---")
-    print("Todos os repositórios e modelos necessários foram verificados e estão prontos.")
-    print("Você agora pode iniciar a aplicação principal.")
 if __name__ == "__main__":
     main()

 #
 # Copyright (C) August 4, 2025  Carlos Rodrigues dos Santos
 #
+# Versão 3.1.0 (Setup Unificado com LTX, SeedVR e VINCIE com Cache Robusto)
+# - Orquestra a instalação de todos os repositórios e modelos para a suíte ADUC-SDR.
+# - Usa snapshot_download para baixar dependências de forma eficiente e correta.
 import os
 import subprocess
 import sys
 from pathlib import Path
 import yaml
+from huggingface_hub import hf_hub_download, snapshot_download
+# ==============================================================================
+# --- CONFIGURAÇÃO DE PATHS E CACHE ---
+# ==============================================================================
+# Assume que /data é um volume persistente montado no contêiner.
 DEPS_DIR = Path("/data")
+CACHE_DIR = DEPS_DIR / ".cache" / "huggingface"
+# --- Paths dos Módulos da Aplicação ---
 LTX_VIDEO_REPO_DIR = DEPS_DIR / "LTX-Video"
+SEEDVR_MODELS_DIR = DEPS_DIR / "models" / "SeedVR"
+VINCIE_REPO_DIR = DEPS_DIR / "VINCIE"
+VINCIE_CKPT_DIR = DEPS_DIR / "ckpt" / "VINCIE-3B"
+# --- Repositórios Git para Clonar ---
 REPOS_TO_CLONE = {
     "LTX-Video": "https://huggingface.co/spaces/Lightricks/ltx-video-distilled",
     "SeedVR": "https://github.com/numz/ComfyUI-SeedVR2_VideoUpscaler",
+    "VINCIE": "https://github.com/ByteDance-Seed/VINCIE",
 }
+# ==============================================================================
+# --- FUNÇÕES AUXILIARES ---
+# ==============================================================================
 def run_command(command, cwd=None):
+    """Executa um comando no terminal de forma segura e com logs claros."""
     print(f"Executando: {' '.join(command)}")
     try:
         subprocess.run(
+            command, check=True, cwd=cwd,
+            stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True,
         )
     except subprocess.CalledProcessError as e:
+        print(f"ERRO: O comando falhou com o código {e.returncode}\nStderr:\n{e.stderr.strip()}")
         sys.exit(1)
     except FileNotFoundError:
+        print(f"ERRO: Comando '{command[0]}' não encontrado. Verifique se o git está instalado.")
         sys.exit(1)
 def _load_ltx_config():
     """Carrega o arquivo de configuração YAML do LTX-Video."""
     print("--- Carregando Configuração do LTX-Video ---")
+    config_file = LTX_VIDEO_REPO_DIR / "configs" / "ltxv-13b-0.9.8-distilled-fp8.yaml"
+    if not config_file.exists():
+        print(f"ERRO: Arquivo de configuração do LTX não encontrado em '{config_file}'")
         return None
+    print(f"Configuração LTX encontrada: {config_file}")
+    with open(config_file, "r") as file:
         return yaml.safe_load(file)
+def _ensure_hf_model(repo_id, filenames=None, allow_patterns=None, local_dir=None):
+    """Função genérica para baixar um ou mais arquivos (hf_hub_download) ou um snapshot (snapshot_download)."""
+    if not repo_id: return
+    print(f"Verificando/Baixando modelo do repositório: '{repo_id}'...")
+    try:
+        if filenames: # Baixa arquivos específicos
+            for filename in filenames:
+                if not filename: continue
                 hf_hub_download(
+                    repo_id=repo_id, filename=filename, cache_dir=str(CACHE_DIR),
+                    local_dir=str(local_dir) if local_dir else None,
+                    #local_dir_use_symlinks=False,
                     token=os.getenv("HF_TOKEN"),
                 )
+        else: # Baixa um snapshot (partes de um repositório)
+            snapshot_download(
+                repo_id=repo_id, cache_dir=str(CACHE_DIR),
+                local_dir=str(local_dir) if local_dir else None,
+                allow_patterns=allow_patterns,
+                token=os.getenv("HF_TOKEN"),
+            )
+        print(f"-> Modelo '{repo_id}' está disponível.")
+    except Exception as e:
+        print(f"ERRO CRÍTICO ao baixar o modelo '{repo_id}': {e}")
+        sys.exit(1)
+# ==============================================================================
+# --- FUNÇÃO PRINCIPAL DE SETUP ---
+# ==============================================================================
 def main():
+    """Orquestra todo o processo de setup do ambiente."""
+    print("--- Iniciando Setup do Ambiente ADUC-SDR (LTX + SeedVR + VINCIE) ---")
     DEPS_DIR.mkdir(exist_ok=True)
+    CACHE_DIR.mkdir(parents=True, exist_ok=True)
     # --- ETAPA 1: Clonar Repositórios ---
+    print("\n--- ETAPA 1: Verificando Repositórios Git ---")
     for repo_name, repo_url in REPOS_TO_CLONE.items():
         repo_path = DEPS_DIR / repo_name
+        if repo_path.is_dir():
+            print(f"Repositório '{repo_name}' já existe em '{repo_path}'. Pulando.")
         else:
             print(f"Clonando '{repo_name}' de {repo_url}...")
             run_command(["git", "clone", "--depth", "1", repo_url, str(repo_path)])
+            print(f"-> '{repo_name}' clonado com sucesso.")
+    # --- ETAPA 2: Baixar Modelos LTX-Video e Dependências ---
+    print("\n--- ETAPA 2: Verificando Modelos LTX-Video e Dependências ---")
     ltx_config = _load_ltx_config()
+    if not ltx_config:
         print("ERRO: Não foi possível carregar a configuração do LTX-Video. Abortando.")
         sys.exit(1)
+    _ensure_hf_model(
+        repo_id="Lightricks/LTX-Video",
+        filenames=[
+            ltx_config.get("checkpoint_path"),
+            ltx_config.get("spatial_upscaler_model_path") # <-- Adicione esta linha
+        ]
+    )
+    _ensure_hf_model(
+        repo_id=ltx_config.get("text_encoder_model_name_or_path"),
+        allow_patterns=["*.json", "*.safetensors"]
+    )
+    enhancer_repos = [
+        ltx_config.get("prompt_enhancer_image_caption_model_name_or_path"),
+        ltx_config.get("prompt_enhancer_llm_model_name_or_path"),
+    ]
+    for repo_id in filter(None, enhancer_repos):
+         _ensure_hf_model(repo_id=repo_id, allow_patterns=["*.json", "*.safetensors", "*.bin"])
+    # --- ETAPA 3: Baixar Modelos SeedVR ---
+    print("\n--- ETAPA 3: Verificando Modelos SeedVR ---")
+    SEEDVR_MODELS_DIR.mkdir(parents=True, exist_ok=True)
+    seedvr_files = {
+        "seedvr2_ema_7b_fp16.safetensors": "MonsterMMORPG/SeedVR2_SECourses",
+        "seedvr2_ema_7b_sharp_fp16.safetensors": "MonsterMMORPG/SeedVR2_SECourses",
+        "ema_vae_fp16.safetensors": "MonsterMMORPG/SeedVR2_SECourses",
+    }
+    for filename, repo_id in seedvr_files.items():
+        if not (SEEDVR_MODELS_DIR / filename).is_file():
+             _ensure_hf_model(repo_id=repo_id, filenames=[filename], local_dir=SEEDVR_MODELS_DIR)
+        else:
+            print(f"Arquivo SeedVR '{filename}' já existe. Pulando.")
+    # --- ETAPA 4: Baixar Modelos VINCIE ---
+    print("\n--- ETAPA 4: Verificando Modelos VINCIE ---")
+    VINCIE_CKPT_DIR.mkdir(parents=True, exist_ok=True)
+    _ensure_hf_model(repo_id="ByteDance-Seed/VINCIE-3B", local_dir=VINCIE_CKPT_DIR)
+    # Cria o symlink de compatibilidade, se necessário
+    repo_ckpt_dir = VINCIE_REPO_DIR / "ckpt"
+    repo_ckpt_dir.mkdir(parents=True, exist_ok=True)
+    link = repo_ckpt_dir / "VINCIE-3B"
+    if not link.exists():
+        link.symlink_to(VINCIE_CKPT_DIR.resolve(), target_is_directory=True)
+        print(f"-> Symlink de compatibilidade VINCIE criado: '{link}' -> '{VINCIE_CKPT_DIR.resolve()}'")
+    else:
+        print(f"-> Symlink de compatibilidade VINCIE já existe.")
+    print("\n\n--- ✅ Setup Completo do Ambiente ADUC-SDR Concluído com Sucesso! ---")
+    print("Todos os repositórios e modelos foram verificados e estão prontos para uso.")
 if __name__ == "__main__":
     main()

start.sh CHANGED Viewed

@@ -1,49 +1,88 @@
-#!/usr/bin/env bash
-set -euo pipefail
-tree -L 4 /app
-tree -L 4 /data
-echo "🚀 Iniciando o script de setup e lançamento do LTX-Video..."
-echo "Usuário atual: $(whoami)"
-# Define as variáveis de ambiente que o LTXServer irá consumir
-export HF_HOME="${HF_HOME:-/data/.cache/huggingface}"
-export OUTPUT_ROOT="${OUTPUT_ROOT:-/app/outputs/ltx}"
-export LTXV_FRAME_LOG_EVERY=8
-export LTXV_DEBUG=1
-# --- Garante que Diretórios Existam ---
-mkdir -p "$OUTPUT_ROOT" "$HF_HOME"
-# 1) Builder (garante Apex/Flash e deps CUDA)
-#echo "🛠️  Iniciando o builder.sh para compilar/instalar dependências CUDA..."
-#if [ -f "/app/builder.sh" ]; then
-#  /bin/bash /app/builder.sh
-#  echo "✅ Builder finalizado."
-#else
-#  echo "⚠️  Aviso: builder.sh não encontrado. Pulando etapa de compilação de dependências."
-#fi
-python setup.py
-cp -rfv /app/LTX-Video/ /data/
-export OUTPUT_ROOT="${OUTPUT_ROOT:-/app/outputs}"
-export INPUT_ROOT="${INPUT_ROOT:-/app/inputs}"
-mkdir -p "$OUTPUT_ROOT" "$INPUT_ROOT"
-echo "[aduc][start] Verificando ambiente como usuário: $(whoami)"
-# Env da UI
-export GRADIO_SERVER_NAME="0.0.0.0"
-export GRADIO_SERVER_PORT="${PORT:-7860}"
-export GRADIO_ENABLE_QUEUE="True"
-echo "[ltx][start] Lançando app_ltx.py..."
-# Executa diretamente o python.
-exec python app.py

+#!/bin/bash
+cp /app/LTX-Video/. /data/LTX-Video -rdfv
+tree -L 6 /data
+# ==============================================================================
+# GERENCIAMENTO DE LOGS NA INICIALIZAÇÃO
+# ==============================================================================
+mkdir /data/logs
+LOG_FILE="/data/logs/session.log"
+# Verifica se o arquivo de log da sessão anterior existe e não está vazio
+if [ -f "$LOG_FILE" ] && [ -s "$LOG_FILE" ]; then
+    echo "[STARTUP] Log da sessão anterior encontrado. Preparando para upload."
+    # Cria um nome de arquivo com timestamp para o upload
+    TODAY=$(date +%Y-%m-%d)
+    TIMESTAMP=$(date +%H-%M-%S)
+    UPLOAD_FILENAME="log-${TIMESTAMP}.txt"
+    export REPO_PATH="logs/${TODAY}/${UPLOAD_FILENAME}"
+    # Move o log antigo para um local temporário para evitar que a aplicação comece a escrever nele
+    TEMP_LOG_PATH="/data/previous_session.log"
+    mv "$LOG_FILE" "$TEMP_LOG_PATH"
+    echo "[STARTUP] Fazendo upload de '$TEMP_LOG_PATH' para o repositório em '$REPO_PATH'..."
+    # Executa o script de upload do Python em segundo plano para não bloquear a inicialização
+    # O token HF_TOKEN deve estar definido como uma variável de ambiente no seu contêiner
+    python - <<'PY' &
+import os
+import time
+from huggingface_hub import HfApi, HfFolder
+# Adiciona uma pequena espera para garantir que a rede esteja pronta
+time.sleep(5)
+repo = os.environ.get("SELF_HF_REPO_ID", "eeuuia/Tmp")
+token = os.getenv("HF_TOKEN")
+log_to_upload = "/data/previous_session.log"
+repo_path = os.getenv("REPO_PATH",'logs/log.log')
+if not token:
+    print("[UPLOAD_SCRIPT] AVISO: HF_TOKEN ausente; upload do log desabilitado.")
+    # Limpa o arquivo temporário mesmo assim
+    if os.path.exists(log_to_upload):
+        os.remove(log_to_upload)
+    exit()
+if not repo_path:
+    print("[UPLOAD_SCRIPT] ERRO: REPO_PATH não definido.")
+    exit()
+try:
+    print(f"[UPLOAD_SCRIPT] Iniciando upload para {repo}...")
+    api = HfApi(token=token)
+    api.upload_file(
+        path_or_fileobj=log_to_upload,
+        path_in_repo=repo_path,
+        repo_id=repo,
+        repo_type="model",
+    )
+    print(f"[UPLOAD_SCRIPT] Upload de log concluído com sucesso para: {repo_path}")
+finally:
+    # Garante que o arquivo de log temporário seja sempre removido após a tentativa de upload
+    if os.path.exists(log_to_upload):
+        os.remove(log_to_upload)
+        print("[UPLOAD_SCRIPT] Arquivo de log temporário limpo.")
+PY
+else
+    echo "[STARTUP] Nenhum log da sessão anterior encontrado. Iniciando com um log limpo."
+fi
+# ==============================================================================
+# INICIALIZAÇÃO DA APLICAÇÃO PRINCIPAL
+# ==============================================================================
+echo "[STARTUP] Iniciando a aplicação principal Gradio (app.py)..."
+# Executa o setup.py primeiro para garantir que as dependências estão prontas
+python /app/setup.py
+# Inicia a aplicação Gradio
+# O `exec` substitui o processo do shell pelo processo do python,
+# o que é uma boa prática para scripts de inicialização de contêineres.
+exec python /app/app.py