Erasing-Concepts-In-Diffusion

Runtime error

App Files Files Community

Damian Stewart committed on Aug 13, 2023

Commit

bf1e262

1 Parent(s): 52c8f3c

allow multiple train prompts

Browse files

Files changed (3) hide show

app.py +49 -37
memory_efficiency.py +4 -1
train.py +75 -61

app.py CHANGED Viewed

@@ -12,15 +12,20 @@ from train import train, training_should_cancel
 import os
 model_map = {}
-def populate_model_map():
     global model_map
     for model_file in os.listdir('models'):
         path = 'models/' + model_file
         if any([existing_path == path for existing_path in model_map.values()]):
             continue
         model_map[model_file] = path
-    return model_map
-model_map = populate_model_map()
 ORIGINAL_SPACE_ID = 'baulab/Erasing-Concepts-In-Diffusion'
 SPACE_ID = os.getenv('SPACE_ID')
@@ -64,6 +69,12 @@ class Demo:
                     with gr.Column(scale=1):
                         self.prompt_input_infr = gr.Text(
                             placeholder="Enter prompt...",
                             label="Prompt",
@@ -104,12 +115,6 @@ class Demo:
                                 interactive=True
                             )
-                        self.base_repo_id_or_path_input_infr = gr.Text(
-                            label="Base model",
-                            value="CompVis/stable-diffusion-v1-4",
-                            info="Path or huggingface repo id of the base model that this edit was done against"
-                        )
                     with gr.Column(scale=2):
                         self.infr_button = gr.Button(
@@ -152,19 +157,10 @@ class Demo:
                             info="Image size for training, should match the model's native image size"
                         )
-                        self.train_sample_batch_size_input = gr.Slider(
-                            value=1,
-                            step=1,
-                            minimum=1,
-                            maximum=32,
-                            label="Sample generation batch size",
-                            info="Batch size for sample generation, larger needs more VRAM"
-                        )
-                        self.prompt_input = gr.Text(
-                            placeholder="Enter prompt...",
-                            label="Prompt to Erase",
-                            info="Prompt corresponding to concept to erase"
                         )
                         choices = ['ESD-x', 'ESD-self', 'ESD-u']
@@ -175,7 +171,7 @@ class Demo:
                             choices=choices,
                             value='ESD-x',
                             label='Train Method',
-                            info='Method of training'
                         )
                         self.neg_guidance_input = gr.Number(
@@ -233,11 +229,21 @@ class Demo:
                             value='',
                             info="Negative prompts for use when generating sample images. One for each positive prompt, or leave empty for none."
                         )
-                        self.train_validate_every_n_steps = gr.Number(
-                            label="Validate Every N Steps",
-                            value=20,
-                            info="Validation and sample generation will be run at intervals of this many steps"
-                        )
                     with gr.Column(scale=1):
@@ -311,7 +317,7 @@ class Demo:
         train_event = self.train_button.click(self.train, inputs = [
             self.train_model_input,
             self.train_img_size_input,
-            self.prompt_input,
             self.train_method_input,
             self.neg_guidance_input,
             self.iterations_input,
@@ -346,9 +352,9 @@ class Demo:
     def reload_models(self, model_dropdown):
         current_model_name = model_dropdown
-        global model_map
-        populate_model_map()
-        return [self.model_dropdown.update(choices=list(model_map.keys()), value=current_model_name)]
     def cancel_training(self):
         if self.training:
@@ -356,7 +362,7 @@ class Demo:
             print("cancellation requested...")
         return [gr.update(value="Cancelling...", interactive=True)]
-    def train(self, repo_id_or_path, img_size, prompt, train_method, neg_guidance, iterations, lr,
               use_adamw8bit=True, use_xformers=False, use_amp=False, use_gradient_checkpointing=False,
               seed=-1, save_every=-1, sample_batch_size=1,
               validation_prompts: str=None, sample_positive_prompts: str=None, sample_negative_prompts: str=None, validate_every_n_steps=-1,
@@ -365,7 +371,7 @@ class Demo:
         :param repo_id_or_path:
         :param img_size:
-        :param prompt:
         :param train_method:
         :param neg_guidance:
         :param iterations:
@@ -386,7 +392,7 @@ class Demo:
         if self.training:
             return [gr.update(interactive=True, value='Train'), gr.update(value='Someone else is training... Try again soon'), None, gr.update()]
-        print(f"Training {repo_id_or_path} at {img_size} to remove '{prompt}'.")
         print(f"  {train_method}, negative guidance {neg_guidance}, lr {lr}, {iterations} iterations.")
         print(f" {'✅' if use_gradient_checkpointing else '❌'} gradient checkpointing")
         print(f" {'✅' if use_amp else '❌'} AMP")
@@ -409,11 +415,12 @@ class Demo:
         while True:
             randn = torch.randint(1, 10000000, (1,)).item()
             options = f'{"a8" if use_adamw8bit else ""}{"AM" if use_amp else ""}{"xf" if use_xformers else ""}{"gc" if use_gradient_checkpointing else ""}'
-            save_path = f"models/{prompt.lower().replace(' ', '')}_{train_method}_ng{neg_guidance}_lr{lr}_iter{iterations}_seed{seed}_{options}__{randn}.pt"
             if not os.path.exists(save_path):
                 break
             # repeat until a not-in-use path is found
         validation_prompts = [] if validation_prompts is None else [p for p in validation_prompts.split('\n') if len(p)>0]
         sample_positive_prompts = [] if sample_positive_prompts is None else [p for p in sample_positive_prompts.split('\n') if len(p)>0]
         sample_negative_prompts = [] if sample_negative_prompts is None else sample_negative_prompts.split('\n')
@@ -425,7 +432,7 @@ class Demo:
             self.training = True
             self.train_cancel_button.update(interactive=True)
             batch_size = 1 # other batch sizes are non-functional
-            save_path = train(repo_id_or_path, img_size, prompt, modules, frozen, iterations, neg_guidance, lr, save_path,
                   use_adamw8bit, use_xformers, use_amp, use_gradient_checkpointing,
                   seed=int(seed), save_every_n_steps=int(save_every),
                               batch_size=int(batch_size), sample_batch_size=int(sample_batch_size),
@@ -476,6 +483,11 @@ class Demo:
         model_path = model_map[model_name]
         checkpoint = torch.load(model_path)
         self.diffuser = StableDiffuser(scheduler='DDIM', repo_id_or_path=base_repo_id_or_path).to('cuda').eval().half()
         finetuner = FineTunedModel.from_checkpoint(self.diffuser, checkpoint).eval().half()

 import os
 model_map = {}
+model_names_list = []
+def populate_global_model_map():
     global model_map
+    global model_names_list
     for model_file in os.listdir('models'):
         path = 'models/' + model_file
         if any([existing_path == path for existing_path in model_map.values()]):
             continue
         model_map[model_file] = path
+    model_names_list.clear()
+    model_names_list.extend(model_map.keys())
+populate_global_model_map()
 ORIGINAL_SPACE_ID = 'baulab/Erasing-Concepts-In-Diffusion'
 SPACE_ID = os.getenv('SPACE_ID')
                     with gr.Column(scale=1):
+                        self.base_repo_id_or_path_input_infr = gr.Text(
+                            label="Base model",
+                            value="CompVis/stable-diffusion-v1-4",
+                            info="Path or huggingface repo id of the base model that this edit was done against"
+                        )
                         self.prompt_input_infr = gr.Text(
                             placeholder="Enter prompt...",
                             label="Prompt",
                                 interactive=True
                             )
                     with gr.Column(scale=2):
                         self.infr_button = gr.Button(
                             info="Image size for training, should match the model's native image size"
                         )
+                        self.train_prompts_input = gr.Text(
+                            placeholder="Enter prompts, one per line",
+                            label="Prompts to Erase",
+                            info="Prompts corresponding to concepts to erase, one per line"
                         )
                         choices = ['ESD-x', 'ESD-self', 'ESD-u']
                             choices=choices,
                             value='ESD-x',
                             label='Train Method',
+                            info='Method of training. ESD-x uses the least VRAM, and you may get OOM errors with the other methods.'
                         )
                         self.neg_guidance_input = gr.Number(
                             value='',
                             info="Negative prompts for use when generating sample images. One for each positive prompt, or leave empty for none."
                         )
+                        with gr.Row():
+                            self.train_sample_batch_size_input = gr.Slider(
+                                value=1,
+                                step=1,
+                                minimum=1,
+                                maximum=32,
+                                label="Sample generation batch size",
+                                info="Batch size for sample generation, larger needs more VRAM"
+                            )
+                            self.train_validate_every_n_steps = gr.Number(
+                                label="Validate Every N Steps",
+                                value=20,
+                                info="Validation and sample generation will be run at intervals of this many steps"
+                            )
                     with gr.Column(scale=1):
         train_event = self.train_button.click(self.train, inputs = [
             self.train_model_input,
             self.train_img_size_input,
+            self.train_prompts_input,
             self.train_method_input,
             self.neg_guidance_input,
             self.iterations_input,
     def reload_models(self, model_dropdown):
         current_model_name = model_dropdown
+        populate_global_model_map()
+        global model_names_list
+        return [self.model_dropdown.update(choices=model_names_list, value=current_model_name)]
     def cancel_training(self):
         if self.training:
             print("cancellation requested...")
         return [gr.update(value="Cancelling...", interactive=True)]
+    def train(self, repo_id_or_path, img_size, prompts, train_method, neg_guidance, iterations, lr,
               use_adamw8bit=True, use_xformers=False, use_amp=False, use_gradient_checkpointing=False,
               seed=-1, save_every=-1, sample_batch_size=1,
               validation_prompts: str=None, sample_positive_prompts: str=None, sample_negative_prompts: str=None, validate_every_n_steps=-1,
         :param repo_id_or_path:
         :param img_size:
+        :param prompts:
         :param train_method:
         :param neg_guidance:
         :param iterations:
         if self.training:
             return [gr.update(interactive=True, value='Train'), gr.update(value='Someone else is training... Try again soon'), None, gr.update()]
+        print(f"Training {repo_id_or_path} at {img_size} to remove '{prompts}'.")
         print(f"  {train_method}, negative guidance {neg_guidance}, lr {lr}, {iterations} iterations.")
         print(f" {'✅' if use_gradient_checkpointing else '❌'} gradient checkpointing")
         print(f" {'✅' if use_amp else '❌'} AMP")
         while True:
             randn = torch.randint(1, 10000000, (1,)).item()
             options = f'{"a8" if use_adamw8bit else ""}{"AM" if use_amp else ""}{"xf" if use_xformers else ""}{"gc" if use_gradient_checkpointing else ""}'
+            save_path = f"models/{prompts[0].lower().replace(' ', '')}_{train_method}_ng{neg_guidance}_lr{lr}_iter{iterations}_seed{seed}_{options}__{randn}.pt"
             if not os.path.exists(save_path):
                 break
             # repeat until a not-in-use path is found
+        prompts = [p for p in prompts.split('\n') if len(p)>0]
         validation_prompts = [] if validation_prompts is None else [p for p in validation_prompts.split('\n') if len(p)>0]
         sample_positive_prompts = [] if sample_positive_prompts is None else [p for p in sample_positive_prompts.split('\n') if len(p)>0]
         sample_negative_prompts = [] if sample_negative_prompts is None else sample_negative_prompts.split('\n')
             self.training = True
             self.train_cancel_button.update(interactive=True)
             batch_size = 1 # other batch sizes are non-functional
+            save_path = train(repo_id_or_path, img_size, prompts, modules, frozen, iterations, neg_guidance, lr, save_path,
                   use_adamw8bit, use_xformers, use_amp, use_gradient_checkpointing,
                   seed=int(seed), save_every_n_steps=int(save_every),
                               batch_size=int(batch_size), sample_batch_size=int(sample_batch_size),
         model_path = model_map[model_name]
         checkpoint = torch.load(model_path)
+        if type(prompt) is str:
+            prompt = [prompt]
+        if type(negative_prompt) is str:
+            negative_prompt = [negative_prompt]
         self.diffuser = StableDiffuser(scheduler='DDIM', repo_id_or_path=base_repo_id_or_path).to('cuda').eval().half()
         finetuner = FineTunedModel.from_checkpoint(self.diffuser, checkpoint).eval().half()

memory_efficiency.py CHANGED Viewed

@@ -66,10 +66,13 @@ class MemoryEfficiencyWrapper:
             growth_interval=25,
         )
-    def step(self, optimizer, loss):
         self.grad_scaler.scale(loss).backward()
         self.grad_scaler.step(optimizer)
         self.grad_scaler.update()
     def __exit__(self, exc_type, exc_value, tb):
         if exc_type is not None:

             growth_interval=25,
         )
+    def backward(self, loss):
         self.grad_scaler.scale(loss).backward()
+    def step(self, optimizer):
         self.grad_scaler.step(optimizer)
         self.grad_scaler.update()
+        optimizer.zero_grad(set_to_none=True)
     def __exit__(self, exc_type, exc_value, tb):
         if exc_type is not None:

train.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import os.path
 import random
 import multiprocessing
 from accelerate.utils import set_seed
 from diffusers import StableDiffusionPipeline
@@ -34,11 +35,13 @@ def validate(diffuser: StableDiffuser, finetuner: FineTunedModel,
         set_seed(validation_seed)
         criteria = torch.nn.MSELoss()
         negative_guidance = 1
-        val_count = 5
         nsteps=50
         num_validation_batches = validation_embeddings.shape[0] // (batch_size*2)
         for i in tqdm(range(num_validation_batches)):
             if training_should_cancel.acquire(block=False):
                 print("cancel requested, bailing")
@@ -58,9 +61,11 @@ def validate(diffuser: StableDiffuser, finetuner: FineTunedModel,
                 loss = criteria(negative_latents, neutral_latents - (negative_guidance*(positive_latents - neutral_latents)))
                 accumulated_loss = (accumulated_loss or 0) + loss.item()
             logger.add_scalar(f"loss/val_{i}", accumulated_loss/val_count, global_step=global_step)
-        num_sample_batches = sample_embeddings.shape[0] // (sample_batch_size*2)
         for i in tqdm(range(0, num_sample_batches)):
             print(f'making sample batch {i}...')
             if training_should_cancel.acquire(block=False):
@@ -82,9 +87,9 @@ def validate(diffuser: StableDiffuser, finetuner: FineTunedModel,
                 images = pipeline(prompt_embeds=batch_prompt_embeds, #sample_embeddings[i*2+1:i*2+2],
                                   negative_prompt_embeds=batch_negative_prompt_embeds, # sample_embeddings[i*2:i*2+1],
                                   num_inference_steps=50)
-                for j in range(sample_batch_size):
-                    image_tensor = transforms.ToTensor()(images.images[j])
-                    logger.add_image(f"samples/{i*sample_batch_size+j}", img_tensor=image_tensor, global_step=global_step)
             """
             with finetuner, torch.cuda.amp.autocast(enabled=use_amp):
@@ -97,20 +102,12 @@ def validate(diffuser: StableDiffuser, finetuner: FineTunedModel,
         torch.cuda.empty_cache()
-def train(repo_id_or_path, img_size, prompt, modules, freeze_modules, iterations, negative_guidance, lr, save_path,
           use_adamw8bit=True, use_xformers=True, use_amp=True, use_gradient_checkpointing=False, seed=-1,
           batch_size=1, sample_batch_size=1,
           save_every_n_steps=-1, validate_every_n_steps=-1,
           validation_prompts=[], sample_positive_prompts=[], sample_negative_prompts=[]):
-    diffuser = None
-    loss = None
-    optimizer = None
-    finetuner = None
-    negative_latents = None
-    neutral_latents = None
-    positive_latents = None
     nsteps = 50
     print(f"using img_size of {img_size}")
     diffuser = StableDiffuser(scheduler='DDIM', repo_id_or_path=repo_id_or_path, native_img_size=img_size).to('cuda')
@@ -118,7 +115,7 @@ def train(repo_id_or_path, img_size, prompt, modules, freeze_modules, iterations
     memory_efficiency_wrapper = MemoryEfficiencyWrapper(diffuser=diffuser, use_amp=use_amp, use_xformers=use_xformers,
                                                         use_gradient_checkpointing=use_gradient_checkpointing )
-    with memory_efficiency_wrapper:
         diffuser.train()
         finetuner = FineTunedModel(diffuser, modules, frozen_modules=freeze_modules)
         if use_adamw8bit:
@@ -139,7 +136,7 @@ def train(repo_id_or_path, img_size, prompt, modules, freeze_modules, iterations
         with torch.no_grad():
             neutral_text_embeddings = diffuser.get_cond_and_uncond_embeddings([''], n_imgs=1)
-            positive_text_embeddings = diffuser.get_cond_and_uncond_embeddings([prompt], n_imgs=1)
             validation_embeddings = diffuser.get_cond_and_uncond_embeddings(validation_prompts, n_imgs=1)
             sample_embeddings = diffuser.get_cond_and_uncond_embeddings(sample_positive_prompts, sample_negative_prompts, n_imgs=1)
@@ -173,51 +170,68 @@ def train(repo_id_or_path, img_size, prompt, modules, freeze_modules, iterations
         start_loss = None
         max_prev_loss_count = 10
         try:
-            for i in pbar:
-                if training_should_cancel.acquire(block=False):
-                    print("cancel requested, bailing")
-                    return None
-                with torch.no_grad():
-                    optimizer.zero_grad()
-                    iteration = torch.randint(1, nsteps - 1, (1,)).item()
-                    with finetuner:
-                        diffused_latents = get_diffused_latents(diffuser, nsteps, positive_text_embeddings, iteration, use_amp)
-                    iteration = int(iteration / nsteps * 1000)
-                    with autocast(enabled=use_amp):
-                        positive_latents = diffuser.predict_noise(iteration, diffused_latents, positive_text_embeddings, guidance_scale=1)
-                        neutral_latents = diffuser.predict_noise(iteration, diffused_latents, neutral_text_embeddings, guidance_scale=1)
-                with finetuner:
-                    with autocast(enabled=use_amp):
-                        negative_latents = diffuser.predict_noise(iteration, diffused_latents, positive_text_embeddings, guidance_scale=1)
-                positive_latents.requires_grad = False
-                neutral_latents.requires_grad = False
-                # loss = criteria(e_n, e_0) works the best try 5000 epochs
-                loss = criteria(negative_latents, neutral_latents - (negative_guidance*(positive_latents - neutral_latents)))
-                memory_efficiency_wrapper.step(optimizer, loss)
-                optimizer.zero_grad()
-                logger.add_scalar("loss", loss.item(), global_step=i)
-                # print moving average loss
-                prev_losses.append(loss.detach().clone())
-                if len(prev_losses) > max_prev_loss_count:
-                    prev_losses.pop(0)
-                if start_loss is None:
-                    start_loss = prev_losses[-1]
-                if len(prev_losses) >= max_prev_loss_count:
-                    moving_average_loss = sum(prev_losses) / len(prev_losses)
-                    print(
-                        f"step {i}: loss={loss.item()} (avg={moving_average_loss.item()}, start ∆={(moving_average_loss - start_loss).item()}")
-                else:
-                    print(f"step {i}: loss={loss.item()}")
                 if save_every_n_steps > 0 and ((i+1) % save_every_n_steps) == 0:
                     torch.save(finetuner.state_dict(), save_path + f"__step_{i+1}.pt")
@@ -231,7 +245,7 @@ def train(repo_id_or_path, img_size, prompt, modules, freeze_modules, iterations
             torch.save(finetuner.state_dict(), save_path)
             return save_path
         finally:
-            del diffuser, loss, optimizer, finetuner, negative_latents, neutral_latents, positive_latents
             torch.cuda.empty_cache()

 import os.path
 import random
 import multiprocessing
+import math
 from accelerate.utils import set_seed
 from diffusers import StableDiffusionPipeline
         set_seed(validation_seed)
         criteria = torch.nn.MSELoss()
         negative_guidance = 1
         nsteps=50
         num_validation_batches = validation_embeddings.shape[0] // (batch_size*2)
+        val_count = max(1, 5 // num_validation_batches)
+        val_total_loss = 0
         for i in tqdm(range(num_validation_batches)):
             if training_should_cancel.acquire(block=False):
                 print("cancel requested, bailing")
                 loss = criteria(negative_latents, neutral_latents - (negative_guidance*(positive_latents - neutral_latents)))
                 accumulated_loss = (accumulated_loss or 0) + loss.item()
+                val_total_loss += loss.item()
             logger.add_scalar(f"loss/val_{i}", accumulated_loss/val_count, global_step=global_step)
+        logger.add_scalar(f"loss/_val_all_combined", val_total_loss/(val_count*num_validation_batches), global_step=global_step)
+        num_sample_batches = int(math.ceil(sample_embeddings.shape[0] / (sample_batch_size*2)))
         for i in tqdm(range(0, num_sample_batches)):
             print(f'making sample batch {i}...')
             if training_should_cancel.acquire(block=False):
                 images = pipeline(prompt_embeds=batch_prompt_embeds, #sample_embeddings[i*2+1:i*2+2],
                                   negative_prompt_embeds=batch_negative_prompt_embeds, # sample_embeddings[i*2:i*2+1],
                                   num_inference_steps=50)
+                for image_index, image in enumerate(images.images):
+                    image_tensor = transforms.ToTensor()(image)
+                    logger.add_image(f"samples/{i*sample_batch_size+image_index}", img_tensor=image_tensor, global_step=global_step)
             """
             with finetuner, torch.cuda.amp.autocast(enabled=use_amp):
         torch.cuda.empty_cache()
+def train(repo_id_or_path, img_size, prompts, modules, freeze_modules, iterations, negative_guidance, lr, save_path,
           use_adamw8bit=True, use_xformers=True, use_amp=True, use_gradient_checkpointing=False, seed=-1,
           batch_size=1, sample_batch_size=1,
           save_every_n_steps=-1, validate_every_n_steps=-1,
           validation_prompts=[], sample_positive_prompts=[], sample_negative_prompts=[]):
     nsteps = 50
     print(f"using img_size of {img_size}")
     diffuser = StableDiffuser(scheduler='DDIM', repo_id_or_path=repo_id_or_path, native_img_size=img_size).to('cuda')
     memory_efficiency_wrapper = MemoryEfficiencyWrapper(diffuser=diffuser, use_amp=use_amp, use_xformers=use_xformers,
                                                         use_gradient_checkpointing=use_gradient_checkpointing )
+    with (((((memory_efficiency_wrapper))))):
         diffuser.train()
         finetuner = FineTunedModel(diffuser, modules, frozen_modules=freeze_modules)
         if use_adamw8bit:
         with torch.no_grad():
             neutral_text_embeddings = diffuser.get_cond_and_uncond_embeddings([''], n_imgs=1)
+            all_positive_text_embeddings = diffuser.get_cond_and_uncond_embeddings(prompts, n_imgs=1)
             validation_embeddings = diffuser.get_cond_and_uncond_embeddings(validation_prompts, n_imgs=1)
             sample_embeddings = diffuser.get_cond_and_uncond_embeddings(sample_positive_prompts, sample_negative_prompts, n_imgs=1)
         start_loss = None
         max_prev_loss_count = 10
         try:
+            loss=None
+            negative_latents=None
+            neutral_latents=None
+            positive_latents=None
+            num_prompts = all_positive_text_embeddings.shape[0] // 2
+            for i in pbar:
+                try:
+                    loss = None
+                    negative_latents = None
+                    positive_latents = None
+                    neutral_latents = None
+                    diffused_latents = None
+                    for j in tqdm(range(num_prompts)):
+                        positive_text_embeddings = all_positive_text_embeddings[j*2:j*2+2]
+                        if training_should_cancel.acquire(block=False):
+                            print("cancel requested, bailing")
+                            return None
+                        with torch.no_grad():
+                            optimizer.zero_grad()
+                            iteration = torch.randint(1, nsteps - 1, (1,)).item()
+                            with finetuner:
+                                diffused_latents = get_diffused_latents(diffuser, nsteps, positive_text_embeddings, iteration, use_amp)
+                            iteration = int(iteration / nsteps * 1000)
+                            with autocast(enabled=use_amp):
+                                positive_latents = diffuser.predict_noise(iteration, diffused_latents, positive_text_embeddings, guidance_scale=1)
+                                neutral_latents = diffuser.predict_noise(iteration, diffused_latents, neutral_text_embeddings, guidance_scale=1)
+                        with finetuner:
+                            with autocast(enabled=use_amp):
+                                negative_latents = diffuser.predict_noise(iteration, diffused_latents, positive_text_embeddings, guidance_scale=1)
+                        positive_latents.requires_grad = False
+                        neutral_latents.requires_grad = False
+                        # loss = criteria(e_n, e_0) works the best try 5000 epochs
+                        loss = criteria(negative_latents, neutral_latents - (negative_guidance*(positive_latents - neutral_latents)))
+                        memory_efficiency_wrapper.backward(loss)
+                    logger.add_scalar("loss", loss.item(), global_step=i)
+                    # print moving average loss
+                    prev_losses.append(loss.detach().clone())
+                    if len(prev_losses) > max_prev_loss_count:
+                        prev_losses.pop(0)
+                    if start_loss is None:
+                        start_loss = prev_losses[-1]
+                    if len(prev_losses) >= max_prev_loss_count:
+                        moving_average_loss = sum(prev_losses) / len(prev_losses)
+                        print(
+                            f"step {i}: loss={loss.item()} (avg={moving_average_loss.item()}, start ∆={(moving_average_loss - start_loss).item()}")
+                    else:
+                        print(f"step {i}: loss={loss.item()}")
+                    memory_efficiency_wrapper.step(optimizer)
+                finally:
+                    del loss, negative_latents, positive_latents, neutral_latents, diffused_latents
                 if save_every_n_steps > 0 and ((i+1) % save_every_n_steps) == 0:
                     torch.save(finetuner.state_dict(), save_path + f"__step_{i+1}.pt")
             torch.save(finetuner.state_dict(), save_path)
             return save_path
         finally:
+            del diffuser, optimizer, finetuner
             torch.cuda.empty_cache()