ACE-Step

Sleeping

App Files Files Community

Sayoyo committed on May 13

Commit

64750a4

1 Parent(s): 8a8cb3e

update examples

Browse files

Files changed (4) hide show

examples/default/input_params/output_20250426091716_0_input_params.json +25 -0
examples/zh_rap_lora/input_params/output_20250512120348_0_input_params.json +1 -1
examples/zh_rap_lora/input_params/output_20250512160830_0_input_params.json +0 -45
pipeline_ace_step.py +708 -215

examples/default/input_params/output_20250426091716_0_input_params.json ADDED Viewed

	@@ -0,0 +1,25 @@

+{
+    "prompt": "anime, cute female vocals, kawaii pop, j-pop, childish, piano, guitar, synthesizer, fast, happy, cheerful, lighthearted",
+    "lyrics": "[Chorus]\nねぇ、顔が赤いよ？\nどうしたの？ 熱があるの？\nそれとも怒ってるの？\nねぇ、言ってよ！\n\nどうしてそんな目で見るの？\n私、悪いことした？\n何か間違えたの？\nお願い、やめて… 怖いから…\nだから、やめてよ…\n\n[Bridge]\n目を閉じて、くるっと背を向けて、\n何も見なかったフリするから、\n怒らないで… 許してよ…\n\n[Chorus]\nねぇ、顔が赤いよ？\nどうしたの？ 熱があるの？\nそれとも怒ってるの？\nねぇ、言ってよ！\n\nどうしてそんな目で見るの？\n私、悪いことした？\n何か間違えたの？\nお願い、やめて… 怖いから…\nだから、やめてよ…\n\n[Bridge 2]\n待って、もし私が悪いなら、\nごめんなさいって言うから、\nアイスクリームあげるから、\nもう怒らないで？\n\nOoooh… 言ってよ！",
+    "audio_duration": 160,
+    "infer_step": 60,
+    "guidance_scale": 15,
+    "scheduler_type": "euler",
+    "cfg_type": "apg",
+    "omega_scale": 10,
+    "guidance_interval": 0.5,
+    "guidance_interval_decay": 0,
+    "min_guidance_scale": 3,
+    "use_erg_tag": true,
+    "use_erg_lyric": true,
+    "use_erg_diffusion": true,
+    "oss_steps": [],
+    "timecosts": {
+        "preprocess": 0.0282442569732666,
+        "diffusion": 12.104875326156616,
+        "latent2audio": 1.587641954421997
+    },
+    "actual_seeds": [
+        4028738662
+    ]
+}

examples/zh_rap_lora/input_params/output_20250512120348_0_input_params.json CHANGED Viewed

@@ -22,7 +22,7 @@
         "latent2audio": 0.5694489479064941
     },
     "actual_seeds": [
-        721655639
     ],
     "retake_seeds": [
         1603201617

         "latent2audio": 0.5694489479064941
     },
     "actual_seeds": [
+        226581098
     ],
     "retake_seeds": [
         1603201617

examples/zh_rap_lora/input_params/output_20250512160830_0_input_params.json DELETED Viewed

@@ -1,45 +0,0 @@
-{
-    "lora_name_or_path": "/root/sag_train/data/ace_step_v1_chinese_rap_lora_80k",
-    "task": "text2music",
-    "prompt": "articulate, spoken word, young adult, rap music, male, clear, energetic, warm, relaxed, breathy, night club",
-    "lyrics": "[verse]\n这 这 谁 又 在 派 对 喝 多\n我 的 脑 袋\n像 被 驴 踢 过\n不 对 劲\n舌 头 打 结 不 会 说\n你 来 挑 战 我 就 跪\n开 局 直 接 崩 溃\n\n[chorus]\n就 咪 乱 咪 念 咪 错 咪\n嘴 咪 瓢 咪 成 咪 狗 咪\n脑 咪 袋 咪 像 咪 浆 咪 糊 咪\n跟 咪 着 咪 节 咪 奏 咪\n把 咪 歌 咪 词 咪 全 咪 忘 咪\n一 咪 张 咪 嘴 咪 就 咪 废 咪\n只 咪 剩 咪 下 咪 尴 咪 尬 咪 回 咪 忆\n草！\n\n[verse]\n错 错 错 错 了\n一 口 气 全 念 错\n错 错 错 错 了\n舌 头 打 结 甩 锅\n甩 甩 甩 甩 锅\n甩 锅 甩 锅\n拍 子 全 部 乱 套\n观 众 笑 到 吐 血\n\n[verse]\n你 的 歌 词 我 的 噩 梦\n唱 完 直 接 社 死\n调 跑 到 外 太 空\n观 众 表 情 裂 开\n你 笑 我 菜\n我 笑 你 不 懂\n这 叫 艺 术 表 演\n不 服 你 来！\n\n[verse]\n这 这 谁 又 在 派 对 丢 人\n我 的 世 界\n已 经 彻 底 崩 溃\n没 有 完 美\n只 有 翻 车 现 场\n以 及 观 众 的 嘲 讽\n\n[chorus]\n就 咪 乱 咪 念 咪 错 咪\n嘴 咪 瓢 咪 成 咪 狗 咪\n脑 咪 袋 咪 像 咪 浆 咪 糊 咪\n跟 咪 着 咪 节 咪 奏 咪\n把 咪 歌 咪 词 咪 全 咪 忘 咪\n一 咪 张 咪 嘴 咪 就 咪 废 咪\n只 咪 剩 咪 下 咪 尴 咪 尬 咪 回 咪 忆\n草！\n\n[verse]\n错 错 错 错 了\n一 口 气 全 念 错\n错 错 错 错 了\n舌 头 打 结 甩 锅\n甩 甩 甩 甩 锅\n甩 锅 甩 锅\n拍 子 全 部 乱 套\n观 众 笑 到 吐 血\n\n[verse]\n你 的 歌 词 我 的 噩 梦\n唱 完 直 接 社 死\n调 跑 到 外 太 空\n观 众 表 情 裂 开\n你 笑 我 菜\n我 笑 你 不 懂\n这 叫 艺 术 表 演\n不 服 你 来！",
-    "audio_duration": 169.12,
-    "infer_step": 60,
-    "guidance_scale": 15,
-    "scheduler_type": "euler",
-    "cfg_type": "apg",
-    "omega_scale": 10,
-    "guidance_interval": 0.5,
-    "guidance_interval_decay": 0,
-    "min_guidance_scale": 3,
-    "use_erg_tag": true,
-    "use_erg_lyric": false,
-    "use_erg_diffusion": true,
-    "oss_steps": [],
-    "timecosts": {
-        "preprocess": 0.041605472564697266,
-        "diffusion": 14.009192705154419,
-        "latent2audio": 1.55946946144104
-    },
-    "actual_seeds": [
-        547563805
-    ],
-    "retake_seeds": [
-        2702917060
-    ],
-    "retake_variance": 0.5,
-    "guidance_scale_text": 0,
-    "guidance_scale_lyric": 0,
-    "repaint_start": 0,
-    "repaint_end": 0,
-    "edit_n_min": 0.0,
-    "edit_n_max": 1.0,
-    "edit_n_avg": 1,
-    "src_audio_path": null,
-    "edit_target_prompt": null,
-    "edit_target_lyrics": null,
-    "audio2audio_enable": false,
-    "ref_audio_strength": 0.5,
-    "ref_audio_input": null,
-    "audio_path": "./outputs/output_20250512160830_0.wav"
-}

pipeline_ace_step.py CHANGED Viewed

@@ -12,9 +12,15 @@ import math
 from huggingface_hub import hf_hub_download, snapshot_download
 # from diffusers.pipelines.pipeline_utils import DiffusionPipeline
-from schedulers.scheduling_flow_match_euler_discrete import FlowMatchEulerDiscreteScheduler
-from schedulers.scheduling_flow_match_heun_discrete import FlowMatchHeunDiscreteScheduler
-from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3 import retrieve_timesteps
 from diffusers.utils.torch_utils import randn_tensor
 from transformers import UMT5EncoderModel, AutoTokenizer
@@ -22,23 +28,42 @@ from language_segmentation import LangSegment
 from music_dcae.music_dcae_pipeline import MusicDCAE
 from models.ace_step_transformer import ACEStepTransformer2DModel
 from models.lyrics_utils.lyric_tokenizer import VoiceBpeTokenizer
-from apg_guidance import apg_forward, MomentumBuffer, cfg_forward, cfg_zero_star, cfg_double_condition_forward
 import torchaudio
 import torio
 torch.backends.cudnn.benchmark = False
-torch.set_float32_matmul_precision('high')
 torch.backends.cudnn.deterministic = True
 torch.backends.cuda.matmul.allow_tf32 = True
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 SUPPORT_LANGUAGES = {
-    "en": 259, "de": 260, "fr": 262, "es": 284, "it": 285,
-    "pt": 286, "pl": 294, "tr": 295, "ru": 267, "cs": 293,
-    "nl": 297, "ar": 5022, "zh": 5023, "ja": 5412, "hu": 5753,
-    "ko": 6152, "hi": 6680
 }
 structure_pattern = re.compile(r"\[.*?\]")
@@ -56,7 +81,16 @@ REPO_ID = "ACE-Step/ACE-Step-v1-3.5B"
 # class ACEStepPipeline(DiffusionPipeline):
 class ACEStepPipeline:
-    def __init__(self, checkpoint_dir=None, device_id=0, dtype="bfloat16", text_encoder_checkpoint_path=None, persistent_storage_path=None, torch_compile=False, **kwargs):
         if not checkpoint_dir:
             if persistent_storage_path is None:
                 checkpoint_dir = os.path.join(os.path.dirname(__file__), "checkpoints")
@@ -64,7 +98,11 @@ class ACEStepPipeline:
                 checkpoint_dir = os.path.join(persistent_storage_path, "checkpoints")
         ensure_directory_exists(checkpoint_dir)
         self.checkpoint_dir = checkpoint_dir
-        device = torch.device(f"cuda:{device_id}") if torch.cuda.is_available() else torch.device("cpu")
         if device.type == "cpu" and torch.backends.mps.is_available():
             device = torch.device("mps")
         self.dtype = torch.bfloat16 if dtype == "bfloat16" else torch.float32
@@ -74,17 +112,25 @@ class ACEStepPipeline:
         self.loaded = False
         self.torch_compile = torch_compile
         self.lora_path = "none"
     def load_lora(self, lora_name_or_path):
         if lora_name_or_path != self.lora_path and lora_name_or_path != "none":
             if not os.path.exists(lora_name_or_path):
-                lora_download_path = snapshot_download(lora_name_or_path, cache_dir=self.checkpoint_dir)
             else:
                 lora_download_path = lora_name_or_path
             if self.lora_path != "none":
                 self.ace_step_transformer.unload_lora()
-            self.ace_step_transformer.load_lora_adapter(os.path.join(lora_download_path, "pytorch_lora_weights.safetensors"), adapter_name="zh_rap_lora", with_alpha=True)
-            logger.info(f"Loading lora weights from: {lora_name_or_path} download path is: {lora_download_path}")
             self.lora_path = lora_name_or_path
         elif self.lora_path != "none" and lora_name_or_path == "none":
             logger.info("No lora weights to load.")
@@ -99,55 +145,124 @@ class ACEStepPipeline:
         text_encoder_model_path = os.path.join(checkpoint_dir, "umt5-base")
         files_exist = (
-            os.path.exists(os.path.join(dcae_model_path, "config.json")) and
-            os.path.exists(os.path.join(dcae_model_path, "diffusion_pytorch_model.safetensors")) and
-            os.path.exists(os.path.join(vocoder_model_path, "config.json")) and
-            os.path.exists(os.path.join(vocoder_model_path, "diffusion_pytorch_model.safetensors")) and
-            os.path.exists(os.path.join(ace_step_model_path, "config.json")) and
-            os.path.exists(os.path.join(ace_step_model_path, "diffusion_pytorch_model.safetensors")) and
-            os.path.exists(os.path.join(text_encoder_model_path, "config.json")) and
-            os.path.exists(os.path.join(text_encoder_model_path, "model.safetensors")) and
-            os.path.exists(os.path.join(text_encoder_model_path, "special_tokens_map.json")) and
-            os.path.exists(os.path.join(text_encoder_model_path, "tokenizer_config.json")) and
-            os.path.exists(os.path.join(text_encoder_model_path, "tokenizer.json"))
         )
         if not files_exist:
-            logger.info(f"Checkpoint directory {checkpoint_dir} is not complete, downloading from Hugging Face Hub")
             # download music dcae model
             os.makedirs(dcae_model_path, exist_ok=True)
-            hf_hub_download(repo_id=REPO_ID, subfolder="music_dcae_f8c8",
-                            filename="config.json", local_dir=checkpoint_dir, local_dir_use_symlinks=False)
-            hf_hub_download(repo_id=REPO_ID, subfolder="music_dcae_f8c8",
-                            filename="diffusion_pytorch_model.safetensors", local_dir=checkpoint_dir, local_dir_use_symlinks=False)
             # download vocoder model
             os.makedirs(vocoder_model_path, exist_ok=True)
-            hf_hub_download(repo_id=REPO_ID, subfolder="music_vocoder",
-                            filename="config.json", local_dir=checkpoint_dir, local_dir_use_symlinks=False)
-            hf_hub_download(repo_id=REPO_ID, subfolder="music_vocoder",
-                            filename="diffusion_pytorch_model.safetensors", local_dir=checkpoint_dir, local_dir_use_symlinks=False)
             # download ace_step transformer model
             os.makedirs(ace_step_model_path, exist_ok=True)
-            hf_hub_download(repo_id=REPO_ID, subfolder="ace_step_transformer",
-                            filename="config.json", local_dir=checkpoint_dir, local_dir_use_symlinks=False)
-            hf_hub_download(repo_id=REPO_ID, subfolder="ace_step_transformer",
-                            filename="diffusion_pytorch_model.safetensors", local_dir=checkpoint_dir, local_dir_use_symlinks=False)
             # download text encoder model
             os.makedirs(text_encoder_model_path, exist_ok=True)
-            hf_hub_download(repo_id=REPO_ID, subfolder="umt5-base",
-                            filename="config.json", local_dir=checkpoint_dir, local_dir_use_symlinks=False)
-            hf_hub_download(repo_id=REPO_ID, subfolder="umt5-base",
-                            filename="model.safetensors", local_dir=checkpoint_dir, local_dir_use_symlinks=False)
-            hf_hub_download(repo_id=REPO_ID, subfolder="umt5-base",
-                            filename="special_tokens_map.json", local_dir=checkpoint_dir, local_dir_use_symlinks=False)
-            hf_hub_download(repo_id=REPO_ID, subfolder="umt5-base",
-                            filename="tokenizer_config.json", local_dir=checkpoint_dir, local_dir_use_symlinks=False)
-            hf_hub_download(repo_id=REPO_ID, subfolder="umt5-base",
-                            filename="tokenizer.json", local_dir=checkpoint_dir, local_dir_use_symlinks=False)
             logger.info("Models downloaded")
@@ -156,29 +271,131 @@ class ACEStepPipeline:
         ace_step_checkpoint_path = ace_step_model_path
         text_encoder_checkpoint_path = text_encoder_model_path
-        self.music_dcae = MusicDCAE(dcae_checkpoint_path=dcae_checkpoint_path, vocoder_checkpoint_path=vocoder_checkpoint_path)
         self.music_dcae.to(device).eval().to(self.dtype)
-        self.ace_step_transformer = ACEStepTransformer2DModel.from_pretrained(ace_step_checkpoint_path, torch_dtype=self.dtype)
         self.ace_step_transformer.to(device).eval().to(self.dtype)
         lang_segment = LangSegment()
-        lang_segment.setfilters([
-            'af', 'am', 'an', 'ar', 'as', 'az', 'be', 'bg', 'bn', 'br', 'bs', 'ca', 'cs', 'cy', 'da', 'de', 'dz', 'el',
-            'en', 'eo', 'es', 'et', 'eu', 'fa', 'fi', 'fo', 'fr', 'ga', 'gl', 'gu', 'he', 'hi', 'hr', 'ht', 'hu', 'hy',
-            'id', 'is', 'it', 'ja', 'jv', 'ka', 'kk', 'km', 'kn', 'ko', 'ku', 'ky', 'la', 'lb', 'lo', 'lt', 'lv', 'mg',
-            'mk', 'ml', 'mn', 'mr', 'ms', 'mt', 'nb', 'ne', 'nl', 'nn', 'no', 'oc', 'or', 'pa', 'pl', 'ps', 'pt', 'qu',
-            'ro', 'ru', 'rw', 'se', 'si', 'sk', 'sl', 'sq', 'sr', 'sv', 'sw', 'ta', 'te', 'th', 'tl', 'tr', 'ug', 'uk',
-            'ur', 'vi', 'vo', 'wa', 'xh', 'zh', 'zu'
-        ])
         self.lang_segment = lang_segment
         self.lyric_tokenizer = VoiceBpeTokenizer()
-        text_encoder_model = UMT5EncoderModel.from_pretrained(text_encoder_checkpoint_path, torch_dtype=self.dtype).eval()
         text_encoder_model = text_encoder_model.to(device).to(self.dtype)
         text_encoder_model.requires_grad_(False)
         self.text_encoder_model = text_encoder_model
-        self.text_tokenizer = AutoTokenizer.from_pretrained(text_encoder_checkpoint_path)
         self.loaded = True
         # compile
@@ -188,7 +405,13 @@ class ACEStepPipeline:
             self.text_encoder_model = torch.compile(self.text_encoder_model)
     def get_text_embeddings(self, texts, device, text_max_length=256):
-        inputs = self.text_tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=text_max_length)
         inputs = {key: value.to(device) for key, value in inputs.items()}
         if self.text_encoder_model.device != device:
             self.text_encoder_model.to(device)
@@ -197,62 +420,87 @@ class ACEStepPipeline:
             last_hidden_states = outputs.last_hidden_state
         attention_mask = inputs["attention_mask"]
         return last_hidden_states, attention_mask
-    def get_text_embeddings_null(self, texts, device, text_max_length=256, tau=0.01, l_min=8, l_max=10):
-        inputs = self.text_tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=text_max_length)
         inputs = {key: value.to(device) for key, value in inputs.items()}
         if self.text_encoder_model.device != device:
             self.text_encoder_model.to(device)
         def forward_with_temperature(inputs, tau=0.01, l_min=8, l_max=10):
             handlers = []
             def hook(module, input, output):
                 output[:] *= tau
                 return output
             for i in range(l_min, l_max):
-                handler = self.text_encoder_model.encoder.block[i].layer[0].SelfAttention.q.register_forward_hook(hook)
                 handlers.append(handler)
             with torch.no_grad():
                 outputs = self.text_encoder_model(**inputs)
                 last_hidden_states = outputs.last_hidden_state
             for hook in handlers:
                 hook.remove()
             return last_hidden_states
         last_hidden_states = forward_with_temperature(inputs, tau, l_min, l_max)
         return last_hidden_states
     def set_seeds(self, batch_size, manual_seeds=None):
-        seeds = None
         if manual_seeds is not None:
             if isinstance(manual_seeds, str):
                 if "," in manual_seeds:
-                    seeds = list(map(int, manual_seeds.split(",")))
                 elif manual_seeds.isdigit():
-                    seeds = int(manual_seeds)
-        random_generators = [torch.Generator(device=self.device) for _ in range(batch_size)]
         actual_seeds = []
         for i in range(batch_size):
-            seed = None
-            if seeds is None:
-                seed = torch.randint(0, 2**32, (1,)).item()
-            if isinstance(seeds, int):
-                seed = seeds
-            if isinstance(seeds, list):
-                seed = seeds[i]
-            random_generators[i].manual_seed(seed)
-            actual_seeds.append(seed)
         return random_generators, actual_seeds
     def get_lang(self, text):
         language = "en"
-        try:
             _ = self.lang_segment.getTexts(text)
             langCounts = self.lang_segment.getCounts()
             language = langCounts[0][0]
@@ -286,7 +534,9 @@ class ACEStepPipeline:
                 else:
                     token_idx = self.lyric_tokenizer.encode(line, lang)
                 if debug:
-                    toks = self.lyric_tokenizer.batch_decode([[tok_id] for tok_id in token_idx])
                     logger.info(f"debbug {line} --> {lang} --> {toks}")
                 lyric_token_idx = lyric_token_idx + token_idx + [2]
             except Exception as e:
@@ -315,11 +565,13 @@ class ACEStepPipeline:
         attention_mask=None,
         momentum_buffer=None,
         momentum_buffer_tar=None,
-        return_src_pred=True
     ):
         noise_pred_src = None
         if return_src_pred:
-            src_latent_model_input = torch.cat([zt_src, zt_src]) if do_classifier_free_guidance else zt_src
             timestep = t.expand(src_latent_model_input.shape[0])
             # source
             noise_pred_src = self.ace_step_transformer(
@@ -334,7 +586,9 @@ class ACEStepPipeline:
             ).sample
             if do_classifier_free_guidance:
-                noise_pred_with_cond_src, noise_pred_uncond_src = noise_pred_src.chunk(2)
                 if cfg_type == "apg":
                     noise_pred_src = apg_forward(
                         pred_cond=noise_pred_with_cond_src,
@@ -349,7 +603,9 @@ class ACEStepPipeline:
                         cfg_strength=guidance_scale,
                     )
-        tar_latent_model_input = torch.cat([zt_tar, zt_tar]) if do_classifier_free_guidance else zt_tar
         timestep = t.expand(tar_latent_model_input.shape[0])
         # target
         noise_pred_tar = self.ace_step_transformer(
@@ -419,26 +675,52 @@ class ACEStepPipeline:
         T_steps = infer_steps
         frame_length = src_latents.shape[-1]
         attention_mask = torch.ones(bsz, frame_length, device=device, dtype=dtype)
-        timesteps, T_steps = retrieve_timesteps(scheduler, T_steps, device, timesteps=None)
         if do_classifier_free_guidance:
             attention_mask = torch.cat([attention_mask] * 2, dim=0)
-            encoder_text_hidden_states = torch.cat([encoder_text_hidden_states, torch.zeros_like(encoder_text_hidden_states)], 0)
             text_attention_mask = torch.cat([text_attention_mask] * 2, dim=0)
-            target_encoder_text_hidden_states = torch.cat([target_encoder_text_hidden_states, torch.zeros_like(target_encoder_text_hidden_states)], 0)
-            target_text_attention_mask = torch.cat([target_text_attention_mask] * 2, dim=0)
-            speaker_embds = torch.cat([speaker_embds, torch.zeros_like(speaker_embds)], 0)
-            target_speaker_embeds = torch.cat([target_speaker_embeds, torch.zeros_like(target_speaker_embeds)], 0)
-            lyric_token_ids = torch.cat([lyric_token_ids, torch.zeros_like(lyric_token_ids)], 0)
             lyric_mask = torch.cat([lyric_mask, torch.zeros_like(lyric_mask)], 0)
-            target_lyric_token_ids = torch.cat([target_lyric_token_ids, torch.zeros_like(target_lyric_token_ids)], 0)
-            target_lyric_mask = torch.cat([target_lyric_mask, torch.zeros_like(target_lyric_mask)], 0)
         momentum_buffer = MomentumBuffer()
         momentum_buffer_tar = MomentumBuffer()
@@ -455,10 +737,10 @@ class ACEStepPipeline:
             if i < n_min:
                 continue
-            t_i = t/1000
-            if i+1 < len(timesteps):
-                t_im1 = (timesteps[i+1])/1000
             else:
                 t_im1 = torch.zeros_like(t_i).to(t_i.device)
@@ -466,7 +748,12 @@ class ACEStepPipeline:
                 # Calculate the average of the V predictions
                 V_delta_avg = torch.zeros_like(x_src)
                 for k in range(n_avg):
-                    fwd_noise = randn_tensor(shape=x_src.shape, generator=random_generators, device=device, dtype=dtype)
                     zt_src = (1 - t_i) * x_src + (t_i) * fwd_noise
@@ -490,22 +777,29 @@ class ACEStepPipeline:
                         guidance_scale=guidance_scale,
                         target_guidance_scale=target_guidance_scale,
                         attention_mask=attention_mask,
-                        momentum_buffer=momentum_buffer
                     )
-                    V_delta_avg += (1 / n_avg) * (Vt_tar - Vt_src) # - (hfg-1)*( x_src))
                 # propagate direct ODE
                 zt_edit = zt_edit.to(torch.float32)
                 zt_edit = zt_edit + (t_im1 - t_i) * V_delta_avg
                 zt_edit = zt_edit.to(V_delta_avg.dtype)
-            else: # i >= T_steps-n_min # regular sampling for last n_min steps
                 if i == n_max:
-                    fwd_noise = randn_tensor(shape=x_src.shape, generator=random_generators, device=device, dtype=dtype)
                     scheduler._init_step_index(t)
                     sigma = scheduler.sigmas[scheduler.step_index]
                     xt_src = sigma * fwd_noise + (1.0 - sigma) * x_src
                     xt_tar = zt_edit + xt_src - x_src
                 _, Vt_tar = self.calc_v(
                     zt_src=None,
                     zt_tar=xt_tar,
@@ -527,13 +821,13 @@ class ACEStepPipeline:
                     momentum_buffer_tar=momentum_buffer_tar,
                     return_src_pred=False,
                 )
                 dtype = Vt_tar.dtype
                 xt_tar = xt_tar.to(torch.float32)
                 prev_sample = xt_tar + (t_im1 - t_i) * Vt_tar
-                prev_sample = prev_sample.to(dtype)
                 xt_tar = prev_sample
         target_latents = zt_edit if xt_tar is None else xt_tar
         return target_latents
@@ -551,7 +845,12 @@ class ACEStepPipeline:
         timesteps = scheduler.timesteps.unsqueeze(1).to(gt_latents.dtype)
         indices = indices.to(timesteps.device).to(gt_latents.dtype).unsqueeze(1)
         nearest_idx = torch.argmin(torch.cdist(indices, timesteps), dim=1)
-        sigma = scheduler.sigmas[nearest_idx].flatten().to(gt_latents.device).to(gt_latents.dtype)
         while len(sigma.shape) < gt_latents.ndim:
             sigma = sigma.unsqueeze(-1)
         noisy_image = sigma * noise + (1.0 - sigma) * gt_latents
@@ -595,15 +894,30 @@ class ACEStepPipeline:
         ref_latents=None,
     ):
-        logger.info("cfg_type: {}, guidance_scale: {}, omega_scale: {}".format(cfg_type, guidance_scale, omega_scale))
         do_classifier_free_guidance = True
         if guidance_scale == 0.0 or guidance_scale == 1.0:
             do_classifier_free_guidance = False
         do_double_condition_guidance = False
-        if guidance_scale_text is not None and guidance_scale_text > 1.0 and guidance_scale_lyric is not None and guidance_scale_lyric > 1.0:
             do_double_condition_guidance = True
-            logger.info("do_double_condition_guidance: {}, guidance_scale_text: {}, guidance_scale_lyric: {}".format(do_double_condition_guidance, guidance_scale_text, guidance_scale_lyric))
         device = encoder_text_hidden_states.device
         dtype = encoder_text_hidden_states.dtype
@@ -619,7 +933,7 @@ class ACEStepPipeline:
                 num_train_timesteps=1000,
                 shift=3.0,
             )
         frame_length = int(duration * 44100 / 512 / 8)
         if src_latents is not None:
             frame_length = src_latents.shape[-1]
@@ -630,31 +944,60 @@ class ACEStepPipeline:
         if len(oss_steps) > 0:
             infer_steps = max(oss_steps)
             scheduler.set_timesteps
-            timesteps, num_inference_steps = retrieve_timesteps(scheduler, num_inference_steps=infer_steps, device=device, timesteps=None)
             new_timesteps = torch.zeros(len(oss_steps), dtype=dtype, device=device)
             for idx in range(len(oss_steps)):
-                new_timesteps[idx] = timesteps[oss_steps[idx]-1]
             num_inference_steps = len(oss_steps)
             sigmas = (new_timesteps / 1000).float().cpu().numpy()
-            timesteps, num_inference_steps = retrieve_timesteps(scheduler, num_inference_steps=num_inference_steps, device=device, sigmas=sigmas)
-            logger.info(f"oss_steps: {oss_steps}, num_inference_steps: {num_inference_steps} after remapping to timesteps {timesteps}")
         else:
-            timesteps, num_inference_steps = retrieve_timesteps(scheduler, num_inference_steps=infer_steps, device=device, timesteps=None)
-        target_latents = randn_tensor(shape=(bsz, 8, 16, frame_length), generator=random_generators, device=device, dtype=dtype)
         is_repaint = False
-        is_extend  = False
         if add_retake_noise:
             n_min = int(infer_steps * (1 - retake_variance))
-            retake_variance = torch.tensor(retake_variance * math.pi/2).to(device).to(dtype)
-            retake_latents = randn_tensor(shape=(bsz, 8, 16, frame_length), generator=retake_random_generators, device=device, dtype=dtype)
             repaint_start_frame = int(repaint_start * 44100 / 512 / 8)
             repaint_end_frame = int(repaint_end * 44100 / 512 / 8)
             x0 = src_latents
             # retake
-            is_repaint = (repaint_end_frame - repaint_start_frame != frame_length)
             is_extend = (repaint_start_frame < 0) or (repaint_end_frame > frame_length)
             if is_extend:
                 is_repaint = True
@@ -662,13 +1005,23 @@ class ACEStepPipeline:
             # TODO: train a mask aware repainting controlnet
             # to make sure mean = 0, std = 1
             if not is_repaint:
-                target_latents = torch.cos(retake_variance) * target_latents + torch.sin(retake_variance) * retake_latents
             elif not is_extend:
-                # if repaint_end_frame
-                repaint_mask = torch.zeros((bsz, 8, 16, frame_length), device=device, dtype=dtype)
                 repaint_mask[:, :, :, repaint_start_frame:repaint_end_frame] = 1.0
-                repaint_noise = torch.cos(retake_variance) * target_latents + torch.sin(retake_variance) * retake_latents
-                repaint_noise = torch.where(repaint_mask == 1.0, repaint_noise, target_latents)
                 zt_edit = x0.clone()
                 z0 = repaint_noise
             elif is_extend:
@@ -684,73 +1037,107 @@ class ACEStepPipeline:
                 if repaint_start_frame < 0:
                     left_pad_frame_length = abs(repaint_start_frame)
                     frame_length = left_pad_frame_length + gt_latents.shape[-1]
-                    extend_gt_latents = torch.nn.functional.pad(gt_latents, (left_pad_frame_length, 0), "constant", 0)
                     if frame_length > max_infer_fame_length:
                         right_trim_length = frame_length - max_infer_fame_length
-                        extend_gt_latents = extend_gt_latents[:,:,:,:max_infer_fame_length]
-                        to_right_pad_gt_latents = extend_gt_latents[:,:,:,-right_trim_length:]
                         frame_length = max_infer_fame_length
                     repaint_start_frame = 0
                     gt_latents = extend_gt_latents
                 if repaint_end_frame > src_latents_length:
                     right_pad_frame_length = repaint_end_frame - gt_latents.shape[-1]
                     frame_length = gt_latents.shape[-1] + right_pad_frame_length
-                    extend_gt_latents = torch.nn.functional.pad(gt_latents, (0, right_pad_frame_length), "constant", 0)
                     if frame_length > max_infer_fame_length:
                         left_trim_length = frame_length - max_infer_fame_length
-                        extend_gt_latents = extend_gt_latents[:,:,:,-max_infer_fame_length:]
-                        to_left_pad_gt_latents = extend_gt_latents[:,:,:,:left_trim_length]
                         frame_length = max_infer_fame_length
                     repaint_end_frame = frame_length
                     gt_latents = extend_gt_latents
-                repaint_mask = torch.zeros((bsz, 8, 16, frame_length), device=device, dtype=dtype)
                 if left_pad_frame_length > 0:
-                    repaint_mask[:,:,:,:left_pad_frame_length] = 1.0
                 if right_pad_frame_length > 0:
-                    repaint_mask[:,:,:,-right_pad_frame_length:] = 1.0
                 x0 = gt_latents
                 padd_list = []
                 if left_pad_frame_length > 0:
                     padd_list.append(retake_latents[:, :, :, :left_pad_frame_length])
-                padd_list.append(target_latents[:,:,:,left_trim_length:target_latents.shape[-1]-right_trim_length])
                 if right_pad_frame_length > 0:
                     padd_list.append(retake_latents[:, :, :, -right_pad_frame_length:])
                 target_latents = torch.cat(padd_list, dim=-1)
-                assert target_latents.shape[-1] == x0.shape[-1], f"{target_latents.shape=} {x0.shape=}"
                 zt_edit = x0.clone()
                 z0 = target_latents
         init_timestep = 1000
         if audio2audio_enable and ref_latents is not None:
-            target_latents, init_timestep = self.add_latents_noise(gt_latents=ref_latents, variance=(1-ref_audio_strength), noise=target_latents, scheduler=scheduler)
         attention_mask = torch.ones(bsz, frame_length, device=device, dtype=dtype)
         # guidance interval
         start_idx = int(num_inference_steps * ((1 - guidance_interval) / 2))
         end_idx = int(num_inference_steps * (guidance_interval / 2 + 0.5))
-        logger.info(f"start_idx: {start_idx}, end_idx: {end_idx}, num_inference_steps: {num_inference_steps}")
         momentum_buffer = MomentumBuffer()
         def forward_encoder_with_temperature(self, inputs, tau=0.01, l_min=4, l_max=6):
             handlers = []
             def hook(module, input, output):
                 output[:] *= tau
                 return output
             for i in range(l_min, l_max):
-                handler = self.ace_step_transformer.lyric_encoder.encoders[i].self_attn.linear_q.register_forward_hook(hook)
                 handlers.append(handler)
-            encoder_hidden_states, encoder_hidden_mask = self.ace_step_transformer.encode(**inputs)
             for hook in handlers:
                 hook.remove()
             return encoder_hidden_states
         # P(speaker, text, lyric)
@@ -767,12 +1154,16 @@ class ACEStepPipeline:
             encoder_hidden_states_null = forward_encoder_with_temperature(
                 self,
                 inputs={
-                    "encoder_text_hidden_states": encoder_text_hidden_states_null if encoder_text_hidden_states_null is not None else torch.zeros_like(encoder_text_hidden_states),
                     "text_attention_mask": text_attention_mask,
                     "speaker_embeds": torch.zeros_like(speaker_embds),
                     "lyric_token_idx": lyric_token_ids,
                     "lyric_mask": lyric_mask,
-                }
             )
         else:
             # P(null_speaker, null_text, null_lyric)
@@ -783,7 +1174,7 @@ class ACEStepPipeline:
                 torch.zeros_like(lyric_token_ids),
                 lyric_mask,
             )
         encoder_hidden_states_no_lyric = None
         if do_double_condition_guidance:
             # P(null_speaker, text, lyric_weaker)
@@ -796,7 +1187,7 @@ class ACEStepPipeline:
                         "speaker_embeds": torch.zeros_like(speaker_embds),
                         "lyric_token_idx": lyric_token_ids,
                         "lyric_mask": lyric_mask,
-                    }
                 )
             # P(null_speaker, text, no_lyric)
             else:
@@ -808,26 +1199,34 @@ class ACEStepPipeline:
                     lyric_mask,
                 )
-        def forward_diffusion_with_temperature(self, hidden_states, timestep, inputs, tau=0.01, l_min=15, l_max=20):
             handlers = []
             def hook(module, input, output):
                 output[:] *= tau
                 return output
             for i in range(l_min, l_max):
-                handler = self.ace_step_transformer.transformer_blocks[i].attn.to_q.register_forward_hook(hook)
                 handlers.append(handler)
-                handler = self.ace_step_transformer.transformer_blocks[i].cross_attn.to_q.register_forward_hook(hook)
                 handlers.append(handler)
-            sample = self.ace_step_transformer.decode(hidden_states=hidden_states, timestep=timestep, **inputs).sample
             for hook in handlers:
                 hook.remove()
             return sample
         for i, t in tqdm(enumerate(timesteps), total=num_inference_steps):
             if t > init_timestep:
@@ -850,8 +1249,15 @@ class ACEStepPipeline:
                 # compute current guidance scale
                 if guidance_interval_decay > 0:
                     # Linearly interpolate to calculate the current guidance scale
-                    progress = (i - start_idx) / (end_idx - start_idx - 1)  # 归一化到[0,1]
-                    current_guidance_scale = guidance_scale - (guidance_scale - min_guidance_scale) * progress * guidance_interval_decay
                 else:
                     current_guidance_scale = guidance_scale
@@ -869,7 +1275,10 @@ class ACEStepPipeline:
                 ).sample
                 noise_pred_with_only_text_cond = None
-                if do_double_condition_guidance and encoder_hidden_states_no_lyric is not None:
                     noise_pred_with_only_text_cond = self.ace_step_transformer.decode(
                         hidden_states=latent_model_input,
                         attention_mask=attention_mask,
@@ -901,7 +1310,10 @@ class ACEStepPipeline:
                         timestep=timestep,
                     ).sample
-                if do_double_condition_guidance and noise_pred_with_only_text_cond is not None:
                     noise_pred = cfg_double_condition_forward(
                         cond_output=noise_pred_with_cond,
                         uncond_output=noise_pred_uncond,
@@ -930,7 +1342,7 @@ class ACEStepPipeline:
                         guidance_scale=current_guidance_scale,
                         i=i,
                         zero_steps=zero_steps,
-                        use_zero_init=use_zero_init
                     )
             else:
                 latent_model_input = latents
@@ -945,9 +1357,9 @@ class ACEStepPipeline:
                 ).sample
             if is_repaint and i >= n_min:
-                t_i = t/1000
-                if i+1 < len(timesteps):
-                    t_im1 = (timesteps[i+1])/1000
                 else:
                     t_im1 = torch.zeros_like(t_i).to(t_i.device)
                 dtype = noise_pred.dtype
@@ -956,18 +1368,37 @@ class ACEStepPipeline:
                 prev_sample = prev_sample.to(dtype)
                 target_latents = prev_sample
                 zt_src = (1 - t_im1) * x0 + (t_im1) * z0
-                target_latents = torch.where(repaint_mask == 1.0, target_latents, zt_src)
             else:
-                target_latents = scheduler.step(model_output=noise_pred, timestep=t, sample=target_latents, return_dict=False, omega=omega_scale)[0]
         if is_extend:
             if to_right_pad_gt_latents is not None:
-                target_latents = torch.cat([target_latents, to_right_pad_gt_latents], dim=-1)
             if to_left_pad_gt_latents is not None:
-                target_latents = torch.cat([to_right_pad_gt_latents, target_latents], dim=0)
         return target_latents
-    def latents2audio(self, latents, target_wav_duration_second=30, sample_rate=48000, save_path=None, format="mp3"):
         output_audio_paths = []
         bs = latents.shape[0]
         audio_lengths = [target_wav_duration_second * sample_rate] * bs
@@ -976,11 +1407,15 @@ class ACEStepPipeline:
             _, pred_wavs = self.music_dcae.decode(pred_latents, sr=sample_rate)
         pred_wavs = [pred_wav.cpu().float() for pred_wav in pred_wavs]
         for i in tqdm(range(bs)):
-            output_audio_path = self.save_wav_file(pred_wavs[i], i, sample_rate=sample_rate)
             output_audio_paths.append(output_audio_path)
         return output_audio_paths
-    def save_wav_file(self, target_wav, idx, save_path=None, sample_rate=48000, format="mp3"):
         if save_path is None:
             logger.warning("save_path is None, using default path ./outputs/")
             base_path = f"./outputs"
@@ -989,9 +1424,17 @@ class ACEStepPipeline:
             base_path = save_path
             ensure_directory_exists(base_path)
-        output_path_flac = f"{base_path}/output_{time.strftime('%Y%m%d%H%M%S')}_{idx}.{format}"
         target_wav = target_wav.float()
-        torchaudio.save(output_path_flac, target_wav, sample_rate=sample_rate, format=format, compression=torio.io.CodecConfig(bit_rate=320000))
         return output_path_flac
     def infer_latents(self, input_audio_path):
@@ -1017,7 +1460,7 @@ class ACEStepPipeline:
         omega_scale: int = 10.0,
         manual_seeds: list = None,
         guidance_interval: float = 0.5,
-        guidance_interval_decay: float = 0.,
         min_guidance_scale: float = 3.0,
         use_erg_tag: bool = True,
         use_erg_lyric: bool = True,
@@ -1060,22 +1503,30 @@ class ACEStepPipeline:
         start_time = time.time()
         random_generators, actual_seeds = self.set_seeds(batch_size, manual_seeds)
-        retake_random_generators, actual_retake_seeds = self.set_seeds(batch_size, retake_seeds)
         if isinstance(oss_steps, str) and len(oss_steps) > 0:
             oss_steps = list(map(int, oss_steps.split(",")))
         else:
             oss_steps = []
         texts = [prompt]
-        encoder_text_hidden_states, text_attention_mask = self.get_text_embeddings(texts, self.device)
         encoder_text_hidden_states = encoder_text_hidden_states.repeat(batch_size, 1, 1)
         text_attention_mask = text_attention_mask.repeat(batch_size, 1)
         encoder_text_hidden_states_null = None
         if use_erg_tag:
-            encoder_text_hidden_states_null = self.get_text_embeddings_null(texts, self.device)
-            encoder_text_hidden_states_null = encoder_text_hidden_states_null.repeat(batch_size, 1, 1)
         # not support for released checkpoint
         speaker_embeds = torch.zeros(batch_size, 512).to(self.device).to(self.dtype)
@@ -1086,8 +1537,18 @@ class ACEStepPipeline:
         if len(lyrics) > 0:
             lyric_token_idx = self.tokenize_lyrics(lyrics, debug=debug)
             lyric_mask = [1] * len(lyric_token_idx)
-            lyric_token_idx = torch.tensor(lyric_token_idx).unsqueeze(0).to(self.device).repeat(batch_size, 1)
-            lyric_mask = torch.tensor(lyric_mask).unsqueeze(0).to(self.device).repeat(batch_size, 1)
         if audio_duration <= 0:
             audio_duration = random.uniform(30.0, 240.0)
@@ -1102,16 +1563,24 @@ class ACEStepPipeline:
         if task == "retake":
             repaint_start = 0
             repaint_end = audio_duration
         src_latents = None
         if src_audio_path is not None:
-            assert src_audio_path is not None and task in ("repaint", "edit", "extend"), "src_audio_path is required for retake/repaint/extend task"
-            assert os.path.exists(src_audio_path), f"src_audio_path {src_audio_path} does not exist"
             src_latents = self.infer_latents(src_audio_path)
         ref_latents = None
         if ref_audio_input is not None and audio2audio_enable:
-            assert ref_audio_input is not None, "ref_audio_input is required for audio2audio task"
             assert os.path.exists(
                 ref_audio_input
             ), f"ref_audio_input {ref_audio_input} does not exist"
@@ -1119,17 +1588,39 @@ class ACEStepPipeline:
         if task == "edit":
             texts = [edit_target_prompt]
-            target_encoder_text_hidden_states, target_text_attention_mask = self.get_text_embeddings(texts, self.device)
-            target_encoder_text_hidden_states = target_encoder_text_hidden_states.repeat(batch_size, 1, 1)
-            target_text_attention_mask = target_text_attention_mask.repeat(batch_size, 1)
-            target_lyric_token_idx = torch.tensor([0]).repeat(batch_size, 1).to(self.device).long()
-            target_lyric_mask = torch.tensor([0]).repeat(batch_size, 1).to(self.device).long()
             if len(edit_target_lyrics) > 0:
-                target_lyric_token_idx = self.tokenize_lyrics(edit_target_lyrics, debug=True)
                 target_lyric_mask = [1] * len(target_lyric_token_idx)
-                target_lyric_token_idx = torch.tensor(target_lyric_token_idx).unsqueeze(0).to(self.device).repeat(batch_size, 1)
-                target_lyric_mask = torch.tensor(target_lyric_mask).unsqueeze(0).to(self.device).repeat(batch_size, 1)
             target_speaker_embeds = speaker_embeds.clone()
@@ -1145,7 +1636,7 @@ class ACEStepPipeline:
                 target_lyric_token_ids=target_lyric_token_idx,
                 target_lyric_mask=target_lyric_mask,
                 src_latents=src_latents,
-                random_generators=retake_random_generators, # more diversity
                 infer_steps=infer_step,
                 guidance_scale=guidance_scale,
                 n_min=edit_n_min,
@@ -1233,7 +1724,7 @@ class ACEStepPipeline:
             "repaint_end": repaint_end,
             "edit_n_min": edit_n_min,
             "edit_n_max": edit_n_max,
-            "edit_n_avg": edit_n_avg,
             "src_audio_path": src_audio_path,
             "edit_target_prompt": edit_target_prompt,
             "edit_target_lyrics": edit_target_lyrics,
@@ -1243,7 +1734,9 @@ class ACEStepPipeline:
         }
         # save input_params_json
         for output_audio_path in output_paths:
-            input_params_json_save_path = output_audio_path.replace(f".{format}", "_input_params.json")
             input_params_json["audio_path"] = output_audio_path
             with open(input_params_json_save_path, "w", encoding="utf-8") as f:
                 json.dump(input_params_json, f, indent=4, ensure_ascii=False)

 from huggingface_hub import hf_hub_download, snapshot_download
 # from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+from schedulers.scheduling_flow_match_euler_discrete import (
+    FlowMatchEulerDiscreteScheduler,
+)
+from schedulers.scheduling_flow_match_heun_discrete import (
+    FlowMatchHeunDiscreteScheduler,
+)
+from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3 import (
+    retrieve_timesteps,
+)
 from diffusers.utils.torch_utils import randn_tensor
 from transformers import UMT5EncoderModel, AutoTokenizer
 from music_dcae.music_dcae_pipeline import MusicDCAE
 from models.ace_step_transformer import ACEStepTransformer2DModel
 from models.lyrics_utils.lyric_tokenizer import VoiceBpeTokenizer
+from apg_guidance import (
+    apg_forward,
+    MomentumBuffer,
+    cfg_forward,
+    cfg_zero_star,
+    cfg_double_condition_forward,
+)
 import torchaudio
 import torio
 # Process-wide PyTorch / tokenizer configuration, applied as a side effect of
 # importing this module.
 # Turn off cuDNN auto-tuning and (below) request deterministic cuDNN kernels.
 torch.backends.cudnn.benchmark = False
 # Allow float32 matmuls to use faster, reduced-precision internal formats.
+torch.set_float32_matmul_precision("high")
 torch.backends.cudnn.deterministic = True
 # Permit TF32 for CUDA matrix multiplications.
 torch.backends.cuda.matmul.allow_tf32 = True
 # Environment flag read by the Hugging Face `tokenizers` library.
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 # Language code -> integer id.
 # NOTE(review): presumably the language-tag token ids used by the lyric
 # tokenizer (VoiceBpeTokenizer) -- confirm against the tokenizer vocabulary.
 SUPPORT_LANGUAGES = {
+    "en": 259,
+    "de": 260,
+    "fr": 262,
+    "es": 284,
+    "it": 285,
+    "pt": 286,
+    "pl": 294,
+    "tr": 295,
+    "ru": 267,
+    "cs": 293,
+    "nl": 297,
+    "ar": 5022,
+    "zh": 5023,
+    "ja": 5412,
+    "hu": 5753,
+    "ko": 6152,
+    "hi": 6680,
 }
 # Non-greedy match of a square-bracketed span such as "[Chorus]".
 structure_pattern = re.compile(r"\[.*?\]")
 # class ACEStepPipeline(DiffusionPipeline):
 class ACEStepPipeline:
+    def __init__(
+        self,
+        checkpoint_dir=None,
+        device_id=0,
+        dtype="bfloat16",
+        text_encoder_checkpoint_path=None,
+        persistent_storage_path=None,
+        torch_compile=False,
+        **kwargs,
+    ):
         if not checkpoint_dir:
             if persistent_storage_path is None:
                 checkpoint_dir = os.path.join(os.path.dirname(__file__), "checkpoints")
                 checkpoint_dir = os.path.join(persistent_storage_path, "checkpoints")
         ensure_directory_exists(checkpoint_dir)
         self.checkpoint_dir = checkpoint_dir
+        device = (
+            torch.device(f"cuda:{device_id}")
+            if torch.cuda.is_available()
+            else torch.device("cpu")
+        )
         if device.type == "cpu" and torch.backends.mps.is_available():
             device = torch.device("mps")
         self.dtype = torch.bfloat16 if dtype == "bfloat16" else torch.float32
         self.loaded = False
         self.torch_compile = torch_compile
         self.lora_path = "none"
     def load_lora(self, lora_name_or_path):
         if lora_name_or_path != self.lora_path and lora_name_or_path != "none":
             if not os.path.exists(lora_name_or_path):
+                lora_download_path = snapshot_download(
+                    lora_name_or_path, cache_dir=self.checkpoint_dir
+                )
             else:
                 lora_download_path = lora_name_or_path
             if self.lora_path != "none":
                 self.ace_step_transformer.unload_lora()
+            self.ace_step_transformer.load_lora_adapter(
+                os.path.join(lora_download_path, "pytorch_lora_weights.safetensors"),
+                adapter_name="zh_rap_lora",
+                with_alpha=True,
+            )
+            logger.info(
+                f"Loading lora weights from: {lora_name_or_path} download path is: {lora_download_path}"
+            )
             self.lora_path = lora_name_or_path
         elif self.lora_path != "none" and lora_name_or_path == "none":
             logger.info("No lora weights to load.")
         text_encoder_model_path = os.path.join(checkpoint_dir, "umt5-base")
         files_exist = (
+            os.path.exists(os.path.join(dcae_model_path, "config.json"))
+            and os.path.exists(
+                os.path.join(dcae_model_path, "diffusion_pytorch_model.safetensors")
+            )
+            and os.path.exists(os.path.join(vocoder_model_path, "config.json"))
+            and os.path.exists(
+                os.path.join(vocoder_model_path, "diffusion_pytorch_model.safetensors")
+            )
+            and os.path.exists(os.path.join(ace_step_model_path, "config.json"))
+            and os.path.exists(
+                os.path.join(ace_step_model_path, "diffusion_pytorch_model.safetensors")
+            )
+            and os.path.exists(os.path.join(text_encoder_model_path, "config.json"))
+            and os.path.exists(
+                os.path.join(text_encoder_model_path, "model.safetensors")
+            )
+            and os.path.exists(
+                os.path.join(text_encoder_model_path, "special_tokens_map.json")
+            )
+            and os.path.exists(
+                os.path.join(text_encoder_model_path, "tokenizer_config.json")
+            )
+            and os.path.exists(os.path.join(text_encoder_model_path, "tokenizer.json"))
         )
         if not files_exist:
+            logger.info(
+                f"Checkpoint directory {checkpoint_dir} is not complete, downloading from Hugging Face Hub"
+            )
             # download music dcae model
             os.makedirs(dcae_model_path, exist_ok=True)
+            hf_hub_download(
+                repo_id=REPO_ID,
+                subfolder="music_dcae_f8c8",
+                filename="config.json",
+                local_dir=checkpoint_dir,
+                local_dir_use_symlinks=False,
+            )
+            hf_hub_download(
+                repo_id=REPO_ID,
+                subfolder="music_dcae_f8c8",
+                filename="diffusion_pytorch_model.safetensors",
+                local_dir=checkpoint_dir,
+                local_dir_use_symlinks=False,
+            )
             # download vocoder model
             os.makedirs(vocoder_model_path, exist_ok=True)
+            hf_hub_download(
+                repo_id=REPO_ID,
+                subfolder="music_vocoder",
+                filename="config.json",
+                local_dir=checkpoint_dir,
+                local_dir_use_symlinks=False,
+            )
+            hf_hub_download(
+                repo_id=REPO_ID,
+                subfolder="music_vocoder",
+                filename="diffusion_pytorch_model.safetensors",
+                local_dir=checkpoint_dir,
+                local_dir_use_symlinks=False,
+            )
             # download ace_step transformer model
             os.makedirs(ace_step_model_path, exist_ok=True)
+            hf_hub_download(
+                repo_id=REPO_ID,
+                subfolder="ace_step_transformer",
+                filename="config.json",
+                local_dir=checkpoint_dir,
+                local_dir_use_symlinks=False,
+            )
+            hf_hub_download(
+                repo_id=REPO_ID,
+                subfolder="ace_step_transformer",
+                filename="diffusion_pytorch_model.safetensors",
+                local_dir=checkpoint_dir,
+                local_dir_use_symlinks=False,
+            )
             # download text encoder model
             os.makedirs(text_encoder_model_path, exist_ok=True)
+            hf_hub_download(
+                repo_id=REPO_ID,
+                subfolder="umt5-base",
+                filename="config.json",
+                local_dir=checkpoint_dir,
+                local_dir_use_symlinks=False,
+            )
+            hf_hub_download(
+                repo_id=REPO_ID,
+                subfolder="umt5-base",
+                filename="model.safetensors",
+                local_dir=checkpoint_dir,
+                local_dir_use_symlinks=False,
+            )
+            hf_hub_download(
+                repo_id=REPO_ID,
+                subfolder="umt5-base",
+                filename="special_tokens_map.json",
+                local_dir=checkpoint_dir,
+                local_dir_use_symlinks=False,
+            )
+            hf_hub_download(
+                repo_id=REPO_ID,
+                subfolder="umt5-base",
+                filename="tokenizer_config.json",
+                local_dir=checkpoint_dir,
+                local_dir_use_symlinks=False,
+            )
+            hf_hub_download(
+                repo_id=REPO_ID,
+                subfolder="umt5-base",
+                filename="tokenizer.json",
+                local_dir=checkpoint_dir,
+                local_dir_use_symlinks=False,
+            )
             logger.info("Models downloaded")
         ace_step_checkpoint_path = ace_step_model_path
         text_encoder_checkpoint_path = text_encoder_model_path
+        self.music_dcae = MusicDCAE(
+            dcae_checkpoint_path=dcae_checkpoint_path,
+            vocoder_checkpoint_path=vocoder_checkpoint_path,
+        )
         self.music_dcae.to(device).eval().to(self.dtype)
+        self.ace_step_transformer = ACEStepTransformer2DModel.from_pretrained(
+            ace_step_checkpoint_path, torch_dtype=self.dtype
+        )
         self.ace_step_transformer.to(device).eval().to(self.dtype)
         lang_segment = LangSegment()
+        lang_segment.setfilters(
+            [
+                "af",
+                "am",
+                "an",
+                "ar",
+                "as",
+                "az",
+                "be",
+                "bg",
+                "bn",
+                "br",
+                "bs",
+                "ca",
+                "cs",
+                "cy",
+                "da",
+                "de",
+                "dz",
+                "el",
+                "en",
+                "eo",
+                "es",
+                "et",
+                "eu",
+                "fa",
+                "fi",
+                "fo",
+                "fr",
+                "ga",
+                "gl",
+                "gu",
+                "he",
+                "hi",
+                "hr",
+                "ht",
+                "hu",
+                "hy",
+                "id",
+                "is",
+                "it",
+                "ja",
+                "jv",
+                "ka",
+                "kk",
+                "km",
+                "kn",
+                "ko",
+                "ku",
+                "ky",
+                "la",
+                "lb",
+                "lo",
+                "lt",
+                "lv",
+                "mg",
+                "mk",
+                "ml",
+                "mn",
+                "mr",
+                "ms",
+                "mt",
+                "nb",
+                "ne",
+                "nl",
+                "nn",
+                "no",
+                "oc",
+                "or",
+                "pa",
+                "pl",
+                "ps",
+                "pt",
+                "qu",
+                "ro",
+                "ru",
+                "rw",
+                "se",
+                "si",
+                "sk",
+                "sl",
+                "sq",
+                "sr",
+                "sv",
+                "sw",
+                "ta",
+                "te",
+                "th",
+                "tl",
+                "tr",
+                "ug",
+                "uk",
+                "ur",
+                "vi",
+                "vo",
+                "wa",
+                "xh",
+                "zh",
+                "zu",
+            ]
+        )
         self.lang_segment = lang_segment
         self.lyric_tokenizer = VoiceBpeTokenizer()
+        text_encoder_model = UMT5EncoderModel.from_pretrained(
+            text_encoder_checkpoint_path, torch_dtype=self.dtype
+        ).eval()
         text_encoder_model = text_encoder_model.to(device).to(self.dtype)
         text_encoder_model.requires_grad_(False)
         self.text_encoder_model = text_encoder_model
+        self.text_tokenizer = AutoTokenizer.from_pretrained(
+            text_encoder_checkpoint_path
+        )
         self.loaded = True
         # compile
             self.text_encoder_model = torch.compile(self.text_encoder_model)
     def get_text_embeddings(self, texts, device, text_max_length=256):
+        inputs = self.text_tokenizer(
+            texts,
+            return_tensors="pt",
+            padding=True,
+            truncation=True,
+            max_length=text_max_length,
+        )
         inputs = {key: value.to(device) for key, value in inputs.items()}
         if self.text_encoder_model.device != device:
             self.text_encoder_model.to(device)
             last_hidden_states = outputs.last_hidden_state
         attention_mask = inputs["attention_mask"]
         return last_hidden_states, attention_mask
+    def get_text_embeddings_null(
+        self, texts, device, text_max_length=256, tau=0.01, l_min=8, l_max=10
+    ):
+        inputs = self.text_tokenizer(
+            texts,
+            return_tensors="pt",
+            padding=True,
+            truncation=True,
+            max_length=text_max_length,
+        )
         inputs = {key: value.to(device) for key, value in inputs.items()}
         if self.text_encoder_model.device != device:
             self.text_encoder_model.to(device)
         def forward_with_temperature(inputs, tau=0.01, l_min=8, l_max=10):
             handlers = []
             def hook(module, input, output):
                 output[:] *= tau
                 return output
             for i in range(l_min, l_max):
+                handler = (
+                    self.text_encoder_model.encoder.block[i]
+                    .layer[0]
+                    .SelfAttention.q.register_forward_hook(hook)
+                )
                 handlers.append(handler)
             with torch.no_grad():
                 outputs = self.text_encoder_model(**inputs)
                 last_hidden_states = outputs.last_hidden_state
             for hook in handlers:
                 hook.remove()
             return last_hidden_states
         last_hidden_states = forward_with_temperature(inputs, tau, l_min, l_max)
         return last_hidden_states
     def set_seeds(self, batch_size, manual_seeds=None):
         """Create one seeded ``torch.Generator`` per batch item.

         Args:
             batch_size: Number of generators (and seeds) to produce.
             manual_seeds: Optional seed specification. Accepted forms:

                 * ``int`` -- the same seed is used for every batch item;
                 * non-empty ``list`` of ``int`` -- item ``i`` uses entry ``i``;
                   if the list is shorter than the batch, its last entry is
                   reused for the remaining items;
                 * ``str`` -- a single non-negative integer (``"42"``) or a
                   comma-separated list (``"1, 2, 3"``). Surrounding whitespace
                   and blank entries (e.g. a trailing comma) are ignored.

                 Any other value (``None``, an empty string or list, a list
                 holding non-integers, ...) makes every batch item draw a fresh
                 random seed.

         Returns:
             ``(random_generators, actual_seeds)``: the generators created on
             ``self.device`` and the integer seed applied to each of them.

         Raises:
             ValueError: If a comma-separated string contains a non-blank entry
                 that cannot be parsed as an integer.
         """
         processed_input_seeds = None
         if isinstance(manual_seeds, str):
             seed_text = manual_seeds.strip()
             if "," in seed_text:
                 # Skip blank entries so "1,2," or "1,,2" no longer die in int("").
                 parsed = [
                     int(token) for token in seed_text.split(",") if token.strip()
                 ]
                 # A string made only of separators carries no seed at all.
                 processed_input_seeds = parsed or None
             elif seed_text.isdigit():
                 processed_input_seeds = int(seed_text)
         elif isinstance(manual_seeds, list):
             if manual_seeds and all(isinstance(s, int) for s in manual_seeds):
                 processed_input_seeds = list(manual_seeds)
         elif isinstance(manual_seeds, int):
             processed_input_seeds = manual_seeds

         random_generators = [
             torch.Generator(device=self.device) for _ in range(batch_size)
         ]
         actual_seeds = []
         for i, generator in enumerate(random_generators):
             if processed_input_seeds is None:
                 seed = torch.randint(0, 2**32, (1,)).item()
             elif isinstance(processed_input_seeds, int):
                 seed = processed_input_seeds
             else:
                 # Reuse the last provided seed once the list is exhausted.
                 seed = processed_input_seeds[min(i, len(processed_input_seeds) - 1)]
             generator.manual_seed(seed)
             actual_seeds.append(seed)
         return random_generators, actual_seeds
     def get_lang(self, text):
         language = "en"
+        try:
             _ = self.lang_segment.getTexts(text)
             langCounts = self.lang_segment.getCounts()
             language = langCounts[0][0]
                 else:
                     token_idx = self.lyric_tokenizer.encode(line, lang)
                 if debug:
+                    toks = self.lyric_tokenizer.batch_decode(
+                        [[tok_id] for tok_id in token_idx]
+                    )
                     logger.info(f"debbug {line} --> {lang} --> {toks}")
                 lyric_token_idx = lyric_token_idx + token_idx + [2]
             except Exception as e:
         attention_mask=None,
         momentum_buffer=None,
         momentum_buffer_tar=None,
+        return_src_pred=True,
     ):
         noise_pred_src = None
         if return_src_pred:
+            src_latent_model_input = (
+                torch.cat([zt_src, zt_src]) if do_classifier_free_guidance else zt_src
+            )
             timestep = t.expand(src_latent_model_input.shape[0])
             # source
             noise_pred_src = self.ace_step_transformer(
             ).sample
             if do_classifier_free_guidance:
+                noise_pred_with_cond_src, noise_pred_uncond_src = noise_pred_src.chunk(
+                    2
+                )
                 if cfg_type == "apg":
                     noise_pred_src = apg_forward(
                         pred_cond=noise_pred_with_cond_src,
                         cfg_strength=guidance_scale,
                     )
+        tar_latent_model_input = (
+            torch.cat([zt_tar, zt_tar]) if do_classifier_free_guidance else zt_tar
+        )
         timestep = t.expand(tar_latent_model_input.shape[0])
         # target
         noise_pred_tar = self.ace_step_transformer(
         T_steps = infer_steps
         frame_length = src_latents.shape[-1]
         attention_mask = torch.ones(bsz, frame_length, device=device, dtype=dtype)
+        timesteps, T_steps = retrieve_timesteps(
+            scheduler, T_steps, device, timesteps=None
+        )
         if do_classifier_free_guidance:
             attention_mask = torch.cat([attention_mask] * 2, dim=0)
+            encoder_text_hidden_states = torch.cat(
+                [
+                    encoder_text_hidden_states,
+                    torch.zeros_like(encoder_text_hidden_states),
+                ],
+                0,
+            )
             text_attention_mask = torch.cat([text_attention_mask] * 2, dim=0)
+            target_encoder_text_hidden_states = torch.cat(
+                [
+                    target_encoder_text_hidden_states,
+                    torch.zeros_like(target_encoder_text_hidden_states),
+                ],
+                0,
+            )
+            target_text_attention_mask = torch.cat(
+                [target_text_attention_mask] * 2, dim=0
+            )
+            speaker_embds = torch.cat(
+                [speaker_embds, torch.zeros_like(speaker_embds)], 0
+            )
+            target_speaker_embeds = torch.cat(
+                [target_speaker_embeds, torch.zeros_like(target_speaker_embeds)], 0
+            )
+            lyric_token_ids = torch.cat(
+                [lyric_token_ids, torch.zeros_like(lyric_token_ids)], 0
+            )
             lyric_mask = torch.cat([lyric_mask, torch.zeros_like(lyric_mask)], 0)
+            target_lyric_token_ids = torch.cat(
+                [target_lyric_token_ids, torch.zeros_like(target_lyric_token_ids)], 0
+            )
+            target_lyric_mask = torch.cat(
+                [target_lyric_mask, torch.zeros_like(target_lyric_mask)], 0
+            )
         momentum_buffer = MomentumBuffer()
         momentum_buffer_tar = MomentumBuffer()
             if i < n_min:
                 continue
+            t_i = t / 1000
+            if i + 1 < len(timesteps):
+                t_im1 = (timesteps[i + 1]) / 1000
             else:
                 t_im1 = torch.zeros_like(t_i).to(t_i.device)
                 # Calculate the average of the V predictions
                 V_delta_avg = torch.zeros_like(x_src)
                 for k in range(n_avg):
+                    fwd_noise = randn_tensor(
+                        shape=x_src.shape,
+                        generator=random_generators,
+                        device=device,
+                        dtype=dtype,
+                    )
                     zt_src = (1 - t_i) * x_src + (t_i) * fwd_noise
                         guidance_scale=guidance_scale,
                         target_guidance_scale=target_guidance_scale,
                         attention_mask=attention_mask,
+                        momentum_buffer=momentum_buffer,
                     )
+                    V_delta_avg += (1 / n_avg) * (
+                        Vt_tar - Vt_src
+                    )  # - (hfg-1)*( x_src))
                 # propagate direct ODE
                 zt_edit = zt_edit.to(torch.float32)
                 zt_edit = zt_edit + (t_im1 - t_i) * V_delta_avg
                 zt_edit = zt_edit.to(V_delta_avg.dtype)
+            else:  # i >= T_steps-n_min # regular sampling for last n_min steps
                 if i == n_max:
+                    fwd_noise = randn_tensor(
+                        shape=x_src.shape,
+                        generator=random_generators,
+                        device=device,
+                        dtype=dtype,
+                    )
                     scheduler._init_step_index(t)
                     sigma = scheduler.sigmas[scheduler.step_index]
                     xt_src = sigma * fwd_noise + (1.0 - sigma) * x_src
                     xt_tar = zt_edit + xt_src - x_src
                 _, Vt_tar = self.calc_v(
                     zt_src=None,
                     zt_tar=xt_tar,
                     momentum_buffer_tar=momentum_buffer_tar,
                     return_src_pred=False,
                 )
                 dtype = Vt_tar.dtype
                 xt_tar = xt_tar.to(torch.float32)
                 prev_sample = xt_tar + (t_im1 - t_i) * Vt_tar
+                prev_sample = prev_sample.to(dtype)
                 xt_tar = prev_sample
         target_latents = zt_edit if xt_tar is None else xt_tar
         return target_latents
         timesteps = scheduler.timesteps.unsqueeze(1).to(gt_latents.dtype)
         indices = indices.to(timesteps.device).to(gt_latents.dtype).unsqueeze(1)
         nearest_idx = torch.argmin(torch.cdist(indices, timesteps), dim=1)
+        sigma = (
+            scheduler.sigmas[nearest_idx]
+            .flatten()
+            .to(gt_latents.device)
+            .to(gt_latents.dtype)
+        )
         while len(sigma.shape) < gt_latents.ndim:
             sigma = sigma.unsqueeze(-1)
         noisy_image = sigma * noise + (1.0 - sigma) * gt_latents
         ref_latents=None,
     ):
+        logger.info(
+            "cfg_type: {}, guidance_scale: {}, omega_scale: {}".format(
+                cfg_type, guidance_scale, omega_scale
+            )
+        )
         do_classifier_free_guidance = True
         if guidance_scale == 0.0 or guidance_scale == 1.0:
             do_classifier_free_guidance = False
         do_double_condition_guidance = False
+        if (
+            guidance_scale_text is not None
+            and guidance_scale_text > 1.0
+            and guidance_scale_lyric is not None
+            and guidance_scale_lyric > 1.0
+        ):
             do_double_condition_guidance = True
+            logger.info(
+                "do_double_condition_guidance: {}, guidance_scale_text: {}, guidance_scale_lyric: {}".format(
+                    do_double_condition_guidance,
+                    guidance_scale_text,
+                    guidance_scale_lyric,
+                )
+            )
         device = encoder_text_hidden_states.device
         dtype = encoder_text_hidden_states.dtype
                 num_train_timesteps=1000,
                 shift=3.0,
             )
         frame_length = int(duration * 44100 / 512 / 8)
         if src_latents is not None:
             frame_length = src_latents.shape[-1]
         if len(oss_steps) > 0:
             infer_steps = max(oss_steps)
             scheduler.set_timesteps
+            timesteps, num_inference_steps = retrieve_timesteps(
+                scheduler,
+                num_inference_steps=infer_steps,
+                device=device,
+                timesteps=None,
+            )
             new_timesteps = torch.zeros(len(oss_steps), dtype=dtype, device=device)
             for idx in range(len(oss_steps)):
+                new_timesteps[idx] = timesteps[oss_steps[idx] - 1]
             num_inference_steps = len(oss_steps)
             sigmas = (new_timesteps / 1000).float().cpu().numpy()
+            timesteps, num_inference_steps = retrieve_timesteps(
+                scheduler,
+                num_inference_steps=num_inference_steps,
+                device=device,
+                sigmas=sigmas,
+            )
+            logger.info(
+                f"oss_steps: {oss_steps}, num_inference_steps: {num_inference_steps} after remapping to timesteps {timesteps}"
+            )
         else:
+            timesteps, num_inference_steps = retrieve_timesteps(
+                scheduler,
+                num_inference_steps=infer_steps,
+                device=device,
+                timesteps=None,
+            )
+        target_latents = randn_tensor(
+            shape=(bsz, 8, 16, frame_length),
+            generator=random_generators,
+            device=device,
+            dtype=dtype,
+        )
         is_repaint = False
+        is_extend = False
         if add_retake_noise:
             n_min = int(infer_steps * (1 - retake_variance))
+            retake_variance = (
+                torch.tensor(retake_variance * math.pi / 2).to(device).to(dtype)
+            )
+            retake_latents = randn_tensor(
+                shape=(bsz, 8, 16, frame_length),
+                generator=retake_random_generators,
+                device=device,
+                dtype=dtype,
+            )
             repaint_start_frame = int(repaint_start * 44100 / 512 / 8)
             repaint_end_frame = int(repaint_end * 44100 / 512 / 8)
             x0 = src_latents
             # retake
+            is_repaint = repaint_end_frame - repaint_start_frame != frame_length
             is_extend = (repaint_start_frame < 0) or (repaint_end_frame > frame_length)
             if is_extend:
                 is_repaint = True
             # TODO: train a mask aware repainting controlnet
             # to make sure mean = 0, std = 1
             if not is_repaint:
+                target_latents = (
+                    torch.cos(retake_variance) * target_latents
+                    + torch.sin(retake_variance) * retake_latents
+                )
             elif not is_extend:
+                # if repaint_end_frame
+                repaint_mask = torch.zeros(
+                    (bsz, 8, 16, frame_length), device=device, dtype=dtype
+                )
                 repaint_mask[:, :, :, repaint_start_frame:repaint_end_frame] = 1.0
+                repaint_noise = (
+                    torch.cos(retake_variance) * target_latents
+                    + torch.sin(retake_variance) * retake_latents
+                )
+                repaint_noise = torch.where(
+                    repaint_mask == 1.0, repaint_noise, target_latents
+                )
                 zt_edit = x0.clone()
                 z0 = repaint_noise
             elif is_extend:
                 if repaint_start_frame < 0:
                     left_pad_frame_length = abs(repaint_start_frame)
                     frame_length = left_pad_frame_length + gt_latents.shape[-1]
+                    extend_gt_latents = torch.nn.functional.pad(
+                        gt_latents, (left_pad_frame_length, 0), "constant", 0
+                    )
                     if frame_length > max_infer_fame_length:
                         right_trim_length = frame_length - max_infer_fame_length
+                        extend_gt_latents = extend_gt_latents[
+                            :, :, :, :max_infer_fame_length
+                        ]
+                        to_right_pad_gt_latents = extend_gt_latents[
+                            :, :, :, -right_trim_length:
+                        ]
                         frame_length = max_infer_fame_length
                     repaint_start_frame = 0
                     gt_latents = extend_gt_latents
                 if repaint_end_frame > src_latents_length:
                     right_pad_frame_length = repaint_end_frame - gt_latents.shape[-1]
                     frame_length = gt_latents.shape[-1] + right_pad_frame_length
+                    extend_gt_latents = torch.nn.functional.pad(
+                        gt_latents, (0, right_pad_frame_length), "constant", 0
+                    )
                     if frame_length > max_infer_fame_length:
                         left_trim_length = frame_length - max_infer_fame_length
+                        extend_gt_latents = extend_gt_latents[
+                            :, :, :, -max_infer_fame_length:
+                        ]
+                        to_left_pad_gt_latents = extend_gt_latents[
+                            :, :, :, :left_trim_length
+                        ]
                         frame_length = max_infer_fame_length
                     repaint_end_frame = frame_length
                     gt_latents = extend_gt_latents
+                repaint_mask = torch.zeros(
+                    (bsz, 8, 16, frame_length), device=device, dtype=dtype
+                )
                 if left_pad_frame_length > 0:
+                    repaint_mask[:, :, :, :left_pad_frame_length] = 1.0
                 if right_pad_frame_length > 0:
+                    repaint_mask[:, :, :, -right_pad_frame_length:] = 1.0
                 x0 = gt_latents
                 padd_list = []
                 if left_pad_frame_length > 0:
                     padd_list.append(retake_latents[:, :, :, :left_pad_frame_length])
+                padd_list.append(
+                    target_latents[
+                        :,
+                        :,
+                        :,
+                        left_trim_length : target_latents.shape[-1] - right_trim_length,
+                    ]
+                )
                 if right_pad_frame_length > 0:
                     padd_list.append(retake_latents[:, :, :, -right_pad_frame_length:])
                 target_latents = torch.cat(padd_list, dim=-1)
+                assert (
+                    target_latents.shape[-1] == x0.shape[-1]
+                ), f"{target_latents.shape=} {x0.shape=}"
                 zt_edit = x0.clone()
                 z0 = target_latents
         init_timestep = 1000
         if audio2audio_enable and ref_latents is not None:
+            target_latents, init_timestep = self.add_latents_noise(
+                gt_latents=ref_latents,
+                variance=(1 - ref_audio_strength),
+                noise=target_latents,
+                scheduler=scheduler,
+            )
         attention_mask = torch.ones(bsz, frame_length, device=device, dtype=dtype)
         # guidance interval
         start_idx = int(num_inference_steps * ((1 - guidance_interval) / 2))
         end_idx = int(num_inference_steps * (guidance_interval / 2 + 0.5))
+        logger.info(
+            f"start_idx: {start_idx}, end_idx: {end_idx}, num_inference_steps: {num_inference_steps}"
+        )
         momentum_buffer = MomentumBuffer()
         def forward_encoder_with_temperature(self, inputs, tau=0.01, l_min=4, l_max=6):
             handlers = []
             def hook(module, input, output):
                 output[:] *= tau
                 return output
             for i in range(l_min, l_max):
+                handler = self.ace_step_transformer.lyric_encoder.encoders[
+                    i
+                ].self_attn.linear_q.register_forward_hook(hook)
                 handlers.append(handler)
+            encoder_hidden_states, encoder_hidden_mask = (
+                self.ace_step_transformer.encode(**inputs)
+            )
             for hook in handlers:
                 hook.remove()
             return encoder_hidden_states
         # P(speaker, text, lyric)
             encoder_hidden_states_null = forward_encoder_with_temperature(
                 self,
                 inputs={
+                    "encoder_text_hidden_states": (
+                        encoder_text_hidden_states_null
+                        if encoder_text_hidden_states_null is not None
+                        else torch.zeros_like(encoder_text_hidden_states)
+                    ),
                     "text_attention_mask": text_attention_mask,
                     "speaker_embeds": torch.zeros_like(speaker_embds),
                     "lyric_token_idx": lyric_token_ids,
                     "lyric_mask": lyric_mask,
+                },
             )
         else:
             # P(null_speaker, null_text, null_lyric)
                 torch.zeros_like(lyric_token_ids),
                 lyric_mask,
             )
         encoder_hidden_states_no_lyric = None
         if do_double_condition_guidance:
             # P(null_speaker, text, lyric_weaker)
                         "speaker_embeds": torch.zeros_like(speaker_embds),
                         "lyric_token_idx": lyric_token_ids,
                         "lyric_mask": lyric_mask,
+                    },
                 )
             # P(null_speaker, text, no_lyric)
             else:
                     lyric_mask,
                 )
+        def forward_diffusion_with_temperature(
+            self, hidden_states, timestep, inputs, tau=0.01, l_min=15, l_max=20
+        ):
             handlers = []
             def hook(module, input, output):
                 output[:] *= tau
                 return output
             for i in range(l_min, l_max):
+                handler = self.ace_step_transformer.transformer_blocks[
+                    i
+                ].attn.to_q.register_forward_hook(hook)
                 handlers.append(handler)
+                handler = self.ace_step_transformer.transformer_blocks[
+                    i
+                ].cross_attn.to_q.register_forward_hook(hook)
                 handlers.append(handler)
+            sample = self.ace_step_transformer.decode(
+                hidden_states=hidden_states, timestep=timestep, **inputs
+            ).sample
             for hook in handlers:
                 hook.remove()
             return sample
         for i, t in tqdm(enumerate(timesteps), total=num_inference_steps):
             if t > init_timestep:
                 # compute current guidance scale
                 if guidance_interval_decay > 0:
                     # Linearly interpolate to calculate the current guidance scale
+                    progress = (i - start_idx) / (
+                        end_idx - start_idx - 1
+                    )  # 归一化到[0,1]
+                    current_guidance_scale = (
+                        guidance_scale
+                        - (guidance_scale - min_guidance_scale)
+                        * progress
+                        * guidance_interval_decay
+                    )
                 else:
                     current_guidance_scale = guidance_scale
                 ).sample
                 noise_pred_with_only_text_cond = None
+                if (
+                    do_double_condition_guidance
+                    and encoder_hidden_states_no_lyric is not None
+                ):
                     noise_pred_with_only_text_cond = self.ace_step_transformer.decode(
                         hidden_states=latent_model_input,
                         attention_mask=attention_mask,
                         timestep=timestep,
                     ).sample
+                if (
+                    do_double_condition_guidance
+                    and noise_pred_with_only_text_cond is not None
+                ):
                     noise_pred = cfg_double_condition_forward(
                         cond_output=noise_pred_with_cond,
                         uncond_output=noise_pred_uncond,
                         guidance_scale=current_guidance_scale,
                         i=i,
                         zero_steps=zero_steps,
+                        use_zero_init=use_zero_init,
                     )
             else:
                 latent_model_input = latents
                 ).sample
             if is_repaint and i >= n_min:
+                t_i = t / 1000
+                if i + 1 < len(timesteps):
+                    t_im1 = (timesteps[i + 1]) / 1000
                 else:
                     t_im1 = torch.zeros_like(t_i).to(t_i.device)
                 dtype = noise_pred.dtype
                 prev_sample = prev_sample.to(dtype)
                 target_latents = prev_sample
                 zt_src = (1 - t_im1) * x0 + (t_im1) * z0
+                target_latents = torch.where(
+                    repaint_mask == 1.0, target_latents, zt_src
+                )
             else:
+                target_latents = scheduler.step(
+                    model_output=noise_pred,
+                    timestep=t,
+                    sample=target_latents,
+                    return_dict=False,
+                    omega=omega_scale,
+                )[0]
         if is_extend:
             if to_right_pad_gt_latents is not None:
+                target_latents = torch.cat(
+                    [target_latents, to_right_pad_gt_latents], dim=-1
+                )
             if to_left_pad_gt_latents is not None:
+                target_latents = torch.cat(
+                    [to_right_pad_gt_latents, target_latents], dim=0
+                )
         return target_latents
+    def latents2audio(
+        self,
+        latents,
+        target_wav_duration_second=30,
+        sample_rate=48000,
+        save_path=None,
+        format="mp3",
+    ):
         output_audio_paths = []
         bs = latents.shape[0]
         audio_lengths = [target_wav_duration_second * sample_rate] * bs
             _, pred_wavs = self.music_dcae.decode(pred_latents, sr=sample_rate)
         pred_wavs = [pred_wav.cpu().float() for pred_wav in pred_wavs]
         for i in tqdm(range(bs)):
+            output_audio_path = self.save_wav_file(
+                pred_wavs[i], i, sample_rate=sample_rate
+            )
             output_audio_paths.append(output_audio_path)
         return output_audio_paths
+    def save_wav_file(
+        self, target_wav, idx, save_path=None, sample_rate=48000, format="mp3"
+    ):
         if save_path is None:
             logger.warning("save_path is None, using default path ./outputs/")
             base_path = f"./outputs"
             base_path = save_path
             ensure_directory_exists(base_path)
+        output_path_flac = (
+            f"{base_path}/output_{time.strftime('%Y%m%d%H%M%S')}_{idx}.{format}"
+        )
         target_wav = target_wav.float()
+        torchaudio.save(
+            output_path_flac,
+            target_wav,
+            sample_rate=sample_rate,
+            format=format,
+            compression=torio.io.CodecConfig(bit_rate=320000),
+        )
         return output_path_flac
     def infer_latents(self, input_audio_path):
         omega_scale: int = 10.0,
         manual_seeds: list = None,
         guidance_interval: float = 0.5,
+        guidance_interval_decay: float = 0.0,
         min_guidance_scale: float = 3.0,
         use_erg_tag: bool = True,
         use_erg_lyric: bool = True,
         start_time = time.time()
         random_generators, actual_seeds = self.set_seeds(batch_size, manual_seeds)
+        retake_random_generators, actual_retake_seeds = self.set_seeds(
+            batch_size, retake_seeds
+        )
         if isinstance(oss_steps, str) and len(oss_steps) > 0:
             oss_steps = list(map(int, oss_steps.split(",")))
         else:
             oss_steps = []
         texts = [prompt]
+        encoder_text_hidden_states, text_attention_mask = self.get_text_embeddings(
+            texts, self.device
+        )
         encoder_text_hidden_states = encoder_text_hidden_states.repeat(batch_size, 1, 1)
         text_attention_mask = text_attention_mask.repeat(batch_size, 1)
         encoder_text_hidden_states_null = None
         if use_erg_tag:
+            encoder_text_hidden_states_null = self.get_text_embeddings_null(
+                texts, self.device
+            )
+            encoder_text_hidden_states_null = encoder_text_hidden_states_null.repeat(
+                batch_size, 1, 1
+            )
         # not support for released checkpoint
         speaker_embeds = torch.zeros(batch_size, 512).to(self.device).to(self.dtype)
         if len(lyrics) > 0:
             lyric_token_idx = self.tokenize_lyrics(lyrics, debug=debug)
             lyric_mask = [1] * len(lyric_token_idx)
+            lyric_token_idx = (
+                torch.tensor(lyric_token_idx)
+                .unsqueeze(0)
+                .to(self.device)
+                .repeat(batch_size, 1)
+            )
+            lyric_mask = (
+                torch.tensor(lyric_mask)
+                .unsqueeze(0)
+                .to(self.device)
+                .repeat(batch_size, 1)
+            )
         if audio_duration <= 0:
             audio_duration = random.uniform(30.0, 240.0)
         if task == "retake":
             repaint_start = 0
             repaint_end = audio_duration
         src_latents = None
         if src_audio_path is not None:
+            assert src_audio_path is not None and task in (
+                "repaint",
+                "edit",
+                "extend",
+            ), "src_audio_path is required for retake/repaint/extend task"
+            assert os.path.exists(
+                src_audio_path
+            ), f"src_audio_path {src_audio_path} does not exist"
             src_latents = self.infer_latents(src_audio_path)
         ref_latents = None
         if ref_audio_input is not None and audio2audio_enable:
+            assert (
+                ref_audio_input is not None
+            ), "ref_audio_input is required for audio2audio task"
             assert os.path.exists(
                 ref_audio_input
             ), f"ref_audio_input {ref_audio_input} does not exist"
         if task == "edit":
             texts = [edit_target_prompt]
+            target_encoder_text_hidden_states, target_text_attention_mask = (
+                self.get_text_embeddings(texts, self.device)
+            )
+            target_encoder_text_hidden_states = (
+                target_encoder_text_hidden_states.repeat(batch_size, 1, 1)
+            )
+            target_text_attention_mask = target_text_attention_mask.repeat(
+                batch_size, 1
+            )
+            target_lyric_token_idx = (
+                torch.tensor([0]).repeat(batch_size, 1).to(self.device).long()
+            )
+            target_lyric_mask = (
+                torch.tensor([0]).repeat(batch_size, 1).to(self.device).long()
+            )
             if len(edit_target_lyrics) > 0:
+                target_lyric_token_idx = self.tokenize_lyrics(
+                    edit_target_lyrics, debug=True
+                )
                 target_lyric_mask = [1] * len(target_lyric_token_idx)
+                target_lyric_token_idx = (
+                    torch.tensor(target_lyric_token_idx)
+                    .unsqueeze(0)
+                    .to(self.device)
+                    .repeat(batch_size, 1)
+                )
+                target_lyric_mask = (
+                    torch.tensor(target_lyric_mask)
+                    .unsqueeze(0)
+                    .to(self.device)
+                    .repeat(batch_size, 1)
+                )
             target_speaker_embeds = speaker_embeds.clone()
                 target_lyric_token_ids=target_lyric_token_idx,
                 target_lyric_mask=target_lyric_mask,
                 src_latents=src_latents,
+                random_generators=retake_random_generators,  # more diversity
                 infer_steps=infer_step,
                 guidance_scale=guidance_scale,
                 n_min=edit_n_min,
             "repaint_end": repaint_end,
             "edit_n_min": edit_n_min,
             "edit_n_max": edit_n_max,
+            "edit_n_avg": edit_n_avg,
             "src_audio_path": src_audio_path,
             "edit_target_prompt": edit_target_prompt,
             "edit_target_lyrics": edit_target_lyrics,
         }
         # save input_params_json
         for output_audio_path in output_paths:
+            input_params_json_save_path = output_audio_path.replace(
+                f".{format}", "_input_params.json"
+            )
             input_params_json["audio_path"] = output_audio_path
             with open(input_params_json_save_path, "w", encoding="utf-8") as f:
                 json.dump(input_params_json, f, indent=4, ensure_ascii=False)