convert to hf model

Files changed:
- .gitignore (+6 -5)
- README.md (+12 -14)
- audiodiffusion/utils.py (+363 -0)
- accelerate_deepspeed.yaml → config/accelerate_deepspeed.yaml (+0 -0)
- accelerate_local.yaml → config/accelerate_local.yaml (+0 -0)
- accelerate_sagemaker.yaml → config/accelerate_sagemaker.yaml (+0 -0)
- ldm_autoencoder_kl.yaml → config/ldm_autoencoder_kl.yaml (+0 -1)
- audio_to_images.py → scripts/audio_to_images.py (+0 -0)
- train_unconditional.py → scripts/train_unconditional.py (+0 -0)
- train_vae.py → scripts/train_vae.py (+65 -20)
.gitignore
CHANGED

@@ -3,9 +3,10 @@ __pycache__
 .ipynb_checkpoints
 data*
 ddpm-ema-audio-*
-flagged
-build
+flagged
+build
 audiodiffusion.egg-info
-lightning_logs
-taming
-checkpoints
+lightning_logs
+taming
+checkpoints
+vae_model
README.md
CHANGED

@@ -45,20 +45,23 @@ You can play around with some pretrained models on [Google Colab](https://colab.
 ---
 
 ## Generate Mel spectrogram dataset from directory of audio files
+#### Install
+```bash
+pip install .
+```
 #### Training can be run with Mel spectrograms of resolution 64x64 on a single commercial grade GPU (e.g. RTX 2080 Ti). The `hop_length` should be set to 1024 for better results.
 
 ```bash
-python audio_to_images.py \
+python scripts/audio_to_images.py \
   --resolution 64 \
   --hop_length 1024 \
   --input_dir path-to-audio-files \
   --output_dir data-test
 ```
-
 #### Generate dataset of 256x256 Mel spectrograms and push to hub (you will need to be authenticated with `huggingface-cli login`).
 
 ```bash
-python audio_to_images.py \
+python scripts/audio_to_images.py \
   --resolution 256 \
   --input_dir path-to-audio-files \
   --output_dir data-256 \
@@ -66,10 +69,9 @@
 ```
 ## Train model
 #### Run training on local machine.
-
 ```bash
-accelerate launch --config_file accelerate_local.yaml \
-  train_unconditional.py \
+accelerate launch --config_file config/accelerate_local.yaml \
+  scripts/train_unconditional.py \
   --dataset_name data-64 \
   --resolution 64 \
   --hop_length 1024 \
@@ -81,12 +83,10 @@ accelerate launch --config_file accelerate_local.yaml \
   --lr_warmup_steps 500 \
   --mixed_precision no
 ```
-
 #### Run training on local machine with `batch_size` of 2 and `gradient_accumulation_steps` 8 to compensate, so that 256x256 resolution model fits on commercial grade GPU and push to hub.
-
 ```bash
-accelerate launch --config_file accelerate_local.yaml \
-  train_unconditional.py \
+accelerate launch --config_file config/accelerate_local.yaml \
+  scripts/train_unconditional.py \
   --dataset_name teticio/audio-diffusion-256 \
   --resolution 256 \
   --output_dir latent-audio-diffusion-256 \
@@ -101,12 +101,10 @@ accelerate launch --config_file accelerate_local.yaml \
   --hub_model_id latent-audio-diffusion-256 \
   --hub_token $(cat $HOME/.huggingface/token)
 ```
-
 #### Run training on SageMaker.
-
 ```bash
-accelerate launch --config_file accelerate_sagemaker.yaml \
-  train_unconditional.py \
+accelerate launch --config_file config/accelerate_sagemaker.yaml \
+  scripts/train_unconditional.py \
   --dataset_name teticio/audio-diffusion-256 \
   --resolution 256 \
   --output_dir ddpm-ema-audio-256 \
audiodiffusion/utils.py
ADDED

@@ -0,0 +1,363 @@
# adapted from https://github.com/huggingface/diffusers/blob/main/scripts/convert_original_stable_diffusion_to_diffusers.py

import torch
from diffusers import AutoencoderKL


def shave_segments(path, n_shave_prefix_segments=1):
    """
    Removes segments. Positive values shave the first segments, negative shave the last segments.
    """
    if n_shave_prefix_segments >= 0:
        return ".".join(path.split(".")[n_shave_prefix_segments:])
    else:
        return ".".join(path.split(".")[:n_shave_prefix_segments])


def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0):
    """
    Updates paths inside resnets to the new naming scheme (local renaming)
    """
    mapping = []
    for old_item in old_list:
        new_item = old_item

        new_item = new_item.replace("nin_shortcut", "conv_shortcut")
        new_item = shave_segments(
            new_item, n_shave_prefix_segments=n_shave_prefix_segments)

        mapping.append({"old": old_item, "new": new_item})

    return mapping


def renew_attention_paths(old_list, n_shave_prefix_segments=0):
    """
    Updates paths inside attentions to the new naming scheme (local renaming)
    """
    mapping = []
    for old_item in old_list:
        new_item = old_item

        # new_item = new_item.replace('norm.weight', 'group_norm.weight')
        # new_item = new_item.replace('norm.bias', 'group_norm.bias')

        # new_item = new_item.replace('proj_out.weight', 'proj_attn.weight')
        # new_item = new_item.replace('proj_out.bias', 'proj_attn.bias')

        # new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)

        mapping.append({"old": old_item, "new": new_item})

    return mapping


def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0):
    """
    Updates paths inside attentions to the new naming scheme (local renaming)
    """
    mapping = []
    for old_item in old_list:
        new_item = old_item

        new_item = new_item.replace("norm.weight", "group_norm.weight")
        new_item = new_item.replace("norm.bias", "group_norm.bias")

        new_item = new_item.replace("q.weight", "query.weight")
        new_item = new_item.replace("q.bias", "query.bias")

        new_item = new_item.replace("k.weight", "key.weight")
        new_item = new_item.replace("k.bias", "key.bias")

        new_item = new_item.replace("v.weight", "value.weight")
        new_item = new_item.replace("v.bias", "value.bias")

        new_item = new_item.replace("proj_out.weight", "proj_attn.weight")
        new_item = new_item.replace("proj_out.bias", "proj_attn.bias")

        new_item = shave_segments(
            new_item, n_shave_prefix_segments=n_shave_prefix_segments)

        mapping.append({"old": old_item, "new": new_item})

    return mapping


def assign_to_checkpoint(paths,
                         checkpoint,
                         old_checkpoint,
                         attention_paths_to_split=None,
                         additional_replacements=None,
                         config=None):
    """
    This does the final conversion step: take locally converted weights and apply a global renaming
    to them. It splits attention layers, and takes into account additional replacements
    that may arise.

    Assigns the weights to the new checkpoint.
    """
    assert isinstance(
        paths, list
    ), "Paths should be a list of dicts containing 'old' and 'new' keys."

    # Splits the attention layers into three variables.
    if attention_paths_to_split is not None:
        for path, path_map in attention_paths_to_split.items():
            old_tensor = old_checkpoint[path]
            channels = old_tensor.shape[0] // 3

            target_shape = (-1,
                            channels) if len(old_tensor.shape) == 3 else (-1)

            num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3

            old_tensor = old_tensor.reshape((num_heads, 3 * channels //
                                             num_heads) + old_tensor.shape[1:])
            query, key, value = old_tensor.split(channels // num_heads, dim=1)

            checkpoint[path_map["query"]] = query.reshape(target_shape)
            checkpoint[path_map["key"]] = key.reshape(target_shape)
            checkpoint[path_map["value"]] = value.reshape(target_shape)

    for path in paths:
        new_path = path["new"]

        # These have already been assigned
        if attention_paths_to_split is not None and new_path in attention_paths_to_split:
            continue

        # Global renaming happens here
        new_path = new_path.replace("middle_block.0", "mid_block.resnets.0")
        new_path = new_path.replace("middle_block.1", "mid_block.attentions.0")
        new_path = new_path.replace("middle_block.2", "mid_block.resnets.1")

        if additional_replacements is not None:
            for replacement in additional_replacements:
                new_path = new_path.replace(replacement["old"],
                                            replacement["new"])

        # proj_attn.weight has to be converted from conv 1D to linear
        if "proj_attn.weight" in new_path:
            checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0]
        else:
            checkpoint[new_path] = old_checkpoint[path["old"]]


def conv_attn_to_linear(checkpoint):
    keys = list(checkpoint.keys())
    attn_keys = ["query.weight", "key.weight", "value.weight"]
    for key in keys:
        if ".".join(key.split(".")[-2:]) in attn_keys:
            if checkpoint[key].ndim > 2:
                checkpoint[key] = checkpoint[key][:, :, 0, 0]
        elif "proj_attn.weight" in key:
            if checkpoint[key].ndim > 2:
                checkpoint[key] = checkpoint[key][:, :, 0]


def create_vae_diffusers_config(original_config):
    """
    Creates a config for the diffusers based on the config of the LDM model.
    """
    vae_params = original_config.model.params.ddconfig
    _ = original_config.model.params.embed_dim

    block_out_channels = [vae_params.ch * mult for mult in vae_params.ch_mult]
    down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels)
    up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels)

    config = dict(
        sample_size=vae_params.resolution,
        in_channels=vae_params.in_channels,
        out_channels=vae_params.out_ch,
        down_block_types=tuple(down_block_types),
        up_block_types=tuple(up_block_types),
        block_out_channels=tuple(block_out_channels),
        latent_channels=vae_params.z_channels,
        layers_per_block=vae_params.num_res_blocks,
    )
    return config


def convert_ldm_vae_checkpoint(checkpoint, config):
    # extract state dict for VAE
    vae_state_dict = checkpoint

    new_checkpoint = {}

    new_checkpoint["encoder.conv_in.weight"] = vae_state_dict[
        "encoder.conv_in.weight"]
    new_checkpoint["encoder.conv_in.bias"] = vae_state_dict[
        "encoder.conv_in.bias"]
    new_checkpoint["encoder.conv_out.weight"] = vae_state_dict[
        "encoder.conv_out.weight"]
    new_checkpoint["encoder.conv_out.bias"] = vae_state_dict[
        "encoder.conv_out.bias"]
    new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict[
        "encoder.norm_out.weight"]
    new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict[
        "encoder.norm_out.bias"]

    new_checkpoint["decoder.conv_in.weight"] = vae_state_dict[
        "decoder.conv_in.weight"]
    new_checkpoint["decoder.conv_in.bias"] = vae_state_dict[
        "decoder.conv_in.bias"]
    new_checkpoint["decoder.conv_out.weight"] = vae_state_dict[
        "decoder.conv_out.weight"]
    new_checkpoint["decoder.conv_out.bias"] = vae_state_dict[
        "decoder.conv_out.bias"]
    new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict[
        "decoder.norm_out.weight"]
    new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict[
        "decoder.norm_out.bias"]

    new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"]
    new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"]
    new_checkpoint["post_quant_conv.weight"] = vae_state_dict[
        "post_quant_conv.weight"]
    new_checkpoint["post_quant_conv.bias"] = vae_state_dict[
        "post_quant_conv.bias"]

    # Retrieves the keys for the encoder down blocks only
    num_down_blocks = len({
        ".".join(layer.split(".")[:3])
        for layer in vae_state_dict if "encoder.down" in layer
    })
    down_blocks = {
        layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key]
        for layer_id in range(num_down_blocks)
    }

    # Retrieves the keys for the decoder up blocks only
    num_up_blocks = len({
        ".".join(layer.split(".")[:3])
        for layer in vae_state_dict if "decoder.up" in layer
    })
    up_blocks = {
        layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key]
        for layer_id in range(num_up_blocks)
    }

    for i in range(num_down_blocks):
        resnets = [
            key for key in down_blocks[i]
            if f"down.{i}" in key and f"down.{i}.downsample" not in key
        ]

        if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict:
            new_checkpoint[
                f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(
                    f"encoder.down.{i}.downsample.conv.weight")
            new_checkpoint[
                f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(
                    f"encoder.down.{i}.downsample.conv.bias")

        paths = renew_vae_resnet_paths(resnets)
        meta_path = {
            "old": f"down.{i}.block",
            "new": f"down_blocks.{i}.resnets"
        }
        assign_to_checkpoint(paths,
                             new_checkpoint,
                             vae_state_dict,
                             additional_replacements=[meta_path],
                             config=config)

    mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key]
    num_mid_res_blocks = 2
    for i in range(1, num_mid_res_blocks + 1):
        resnets = [
            key for key in mid_resnets if f"encoder.mid.block_{i}" in key
        ]

        paths = renew_vae_resnet_paths(resnets)
        meta_path = {
            "old": f"mid.block_{i}",
            "new": f"mid_block.resnets.{i - 1}"
        }
        assign_to_checkpoint(paths,
                             new_checkpoint,
                             vae_state_dict,
                             additional_replacements=[meta_path],
                             config=config)

    mid_attentions = [
        key for key in vae_state_dict if "encoder.mid.attn" in key
    ]
    paths = renew_vae_attention_paths(mid_attentions)
    meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
    assign_to_checkpoint(paths,
                         new_checkpoint,
                         vae_state_dict,
                         additional_replacements=[meta_path],
                         config=config)
    conv_attn_to_linear(new_checkpoint)

    for i in range(num_up_blocks):
        block_id = num_up_blocks - 1 - i
        resnets = [
            key for key in up_blocks[block_id]
            if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key
        ]

        if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict:
            new_checkpoint[
                f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[
                    f"decoder.up.{block_id}.upsample.conv.weight"]
            new_checkpoint[
                f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[
                    f"decoder.up.{block_id}.upsample.conv.bias"]

        paths = renew_vae_resnet_paths(resnets)
        meta_path = {
            "old": f"up.{block_id}.block",
            "new": f"up_blocks.{i}.resnets"
        }
        assign_to_checkpoint(paths,
                             new_checkpoint,
                             vae_state_dict,
                             additional_replacements=[meta_path],
                             config=config)

    mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key]
    num_mid_res_blocks = 2
    for i in range(1, num_mid_res_blocks + 1):
        resnets = [
            key for key in mid_resnets if f"decoder.mid.block_{i}" in key
        ]

        paths = renew_vae_resnet_paths(resnets)
        meta_path = {
            "old": f"mid.block_{i}",
            "new": f"mid_block.resnets.{i - 1}"
        }
        assign_to_checkpoint(paths,
                             new_checkpoint,
                             vae_state_dict,
                             additional_replacements=[meta_path],
                             config=config)

    mid_attentions = [
        key for key in vae_state_dict if "decoder.mid.attn" in key
    ]
    paths = renew_vae_attention_paths(mid_attentions)
    meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
    assign_to_checkpoint(paths,
                         new_checkpoint,
                         vae_state_dict,
                         additional_replacements=[meta_path],
                         config=config)
    conv_attn_to_linear(new_checkpoint)
    return new_checkpoint


def convert_ldm_to_hf_vae(ldm_checkpoint, ldm_config, hf_checkpoint):
    checkpoint = torch.load(ldm_checkpoint)["state_dict"]

    # Convert the VAE model.
    vae_config = create_vae_diffusers_config(ldm_config)
    converted_vae_checkpoint = convert_ldm_vae_checkpoint(
        checkpoint, vae_config)

    vae = AutoencoderKL(**vae_config)
    vae.load_state_dict(converted_vae_checkpoint)
    vae.save_pretrained(hf_checkpoint)
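The entry point of the new module is `convert_ldm_to_hf_vae`, which loads an LDM Lightning checkpoint, remaps the VAE weights into diffusers naming, and saves an `AutoencoderKL` directory. A minimal usage sketch follows; the checkpoint path and output directory are hypothetical examples, and the config is assumed to be the repo's `config/ldm_autoencoder_kl.yaml`:

```python
# Sketch only: illustrative paths, not part of this commit.
from omegaconf import OmegaConf
from diffusers import AutoencoderKL

from audiodiffusion.utils import convert_ldm_to_hf_vae

# LDM config used for VAE training (same file passed to scripts/train_vae.py)
ldm_config = OmegaConf.load("config/ldm_autoencoder_kl.yaml")

# Convert a Lightning checkpoint into a diffusers-format VAE directory
convert_ldm_to_hf_vae(ldm_checkpoint="checkpoints/last.ckpt",
                      ldm_config=ldm_config,
                      hf_checkpoint="vae_model")

# The converted model can then be reloaded with diffusers
vae = AutoencoderKL.from_pretrained("vae_model")
```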
accelerate_deepspeed.yaml → config/accelerate_deepspeed.yaml
RENAMED
File without changes

accelerate_local.yaml → config/accelerate_local.yaml
RENAMED
File without changes

accelerate_sagemaker.yaml → config/accelerate_sagemaker.yaml
RENAMED
File without changes
ldm_autoencoder_kl.yaml → config/ldm_autoencoder_kl.yaml
RENAMED

@@ -27,6 +27,5 @@ model:
 lightning:
   trainer:
     benchmark: True
-    accumulate_grad_batches: 24
     accelerator: gpu
     devices: 1
audio_to_images.py → scripts/audio_to_images.py
RENAMED
File without changes

train_unconditional.py → scripts/train_unconditional.py
RENAMED
File without changes
train_vae.py → scripts/train_vae.py
RENAMED

@@ -4,7 +4,8 @@
 
 # TODO
 # grayscale
-#
+# add vae to train_uncond (no_grad)
+# update README
 
 import os
 import argparse
@@ -15,21 +16,26 @@ import numpy as np
 from PIL import Image
 import pytorch_lightning as pl
 from omegaconf import OmegaConf
-from datasets import load_dataset
 from librosa.util import normalize
 from ldm.util import instantiate_from_config
 from pytorch_lightning.trainer import Trainer
 from torch.utils.data import DataLoader, Dataset
+from datasets import load_from_disk, load_dataset
 from pytorch_lightning.callbacks import Callback, ModelCheckpoint
+from pytorch_lightning.utilities.distributed import rank_zero_only
 
 from audiodiffusion.mel import Mel
+from audiodiffusion.utils import convert_ldm_to_hf_vae
 
 
 class AudioDiffusion(Dataset):
 
     def __init__(self, model_id):
         super().__init__()
-        self.hf_dataset = load_dataset(model_id)['train']
+        if os.path.exists(model_id):
+            self.hf_dataset = load_from_disk(model_id)['train']
+        else:
+            self.hf_dataset = load_dataset(model_id)['train']
 
     def __len__(self):
         return len(self.hf_dataset)
@@ -65,11 +71,8 @@ class ImageLogger(Callback):
                            hop_length=hop_length)
         self.every = every
 
-    def on_train_batch_end(self, trainer, pl_module, outputs, batch,
-                           batch_idx):
-        if (batch_idx + 1) % self.every != 0:
-            return
-
+    @rank_zero_only
+    def log_images_and_audios(self, pl_module, batch):
         pl_module.eval()
         with torch.no_grad():
             images = pl_module.log_images(batch, split='train')
@@ -96,27 +99,69 @@
             global_step=pl_module.global_step,
             sample_rate=self.mel.get_sample_rate())
 
+    def on_train_batch_end(self, trainer, pl_module, outputs, batch,
+                           batch_idx):
+        if (batch_idx + 1) % self.every != 0:
+            return
+        self.log_images_and_audios(pl_module, batch)
+
+
+class HFModelCheckpoint(ModelCheckpoint):
+
+    def __init__(self, ldm_config, hf_checkpoint='vae_model', *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.ldm_config = ldm_config
+        self.hf_checkpoint = hf_checkpoint
+
+    def on_train_epoch_end(self, trainer, pl_module):
+        super().on_train_epoch_end(trainer, pl_module)
+        ldm_checkpoint = self.format_checkpoint_name(
+            {'epoch': trainer.current_epoch})
+        convert_ldm_to_hf_vae(ldm_checkpoint, self.ldm_config,
+                              self.hf_checkpoint)
+
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Train VAE using ldm.")
-    parser.add_argument("--
+    parser.add_argument("-d", "--dataset_name", type=str, default=None)
+    parser.add_argument("-b", "--batch_size", type=int, default=1)
+    parser.add_argument("-c",
+                        "--ldm_config_file",
+                        type=str,
+                        default="config/ldm_autoencoder_kl.yaml")
+    parser.add_argument("--ldm_checkpoint_dir",
+                        type=str,
+                        default="checkpoints")
+    parser.add_argument("--hf_checkpoint_dir", type=str, default="vae_model")
+    parser.add_argument("-r",
+                        "--resume_from_checkpoint",
+                        type=str,
+                        default=None)
+    parser.add_argument("-g",
+                        "--gradient_accumulation_steps",
+                        type=int,
+                        default=1)
     args = parser.parse_args()
 
-    config = OmegaConf.load(
+    config = OmegaConf.load(args.ldm_config_file)
     lightning_config = config.pop("lightning", OmegaConf.create())
     trainer_config = lightning_config.get("trainer", OmegaConf.create())
+    trainer_config.accumulate_grad_batches = args.gradient_accumulation_steps
     trainer_opt = argparse.Namespace(**trainer_config)
-    trainer = Trainer.from_argparse_args(
-
-
-
-
-
-
-
-
+    trainer = Trainer.from_argparse_args(
+        trainer_opt,
+        resume_from_checkpoint=args.resume_from_checkpoint,
+        callbacks=[
+            ImageLogger(),
+            HFModelCheckpoint(ldm_config=config,
+                              hf_checkpoint=args.hf_checkpoint_dir,
+                              dirpath=args.ldm_checkpoint_dir,
+                              filename='{epoch:06}',
+                              verbose=True,
+                              save_last=True)
+        ])
     model = instantiate_from_config(config.model)
     model.learning_rate = config.model.base_learning_rate
-    data = AudioDiffusionDataModule(
+    data = AudioDiffusionDataModule(args.dataset_name,
                                     batch_size=args.batch_size)
     trainer.fit(model, data)
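The TODO added at the top of the script ("add vae to train_uncond (no_grad)") points at the intended next step: using the exported `vae_model` directory inside `train_unconditional.py` under `torch.no_grad()`. A rough sketch of what that hookup could look like, not part of this commit, assuming the standard diffusers `AutoencoderKL` encode/decode interface:

```python
# Sketch only: how the converted VAE could be plugged into latent diffusion
# training. The "vae_model" path is whatever was passed as --hf_checkpoint_dir.
import torch
from diffusers import AutoencoderKL

vae = AutoencoderKL.from_pretrained("vae_model")
vae.eval()


@torch.no_grad()
def encode_images(images: torch.Tensor) -> torch.Tensor:
    """Map a batch of mel-spectrogram images to VAE latents."""
    return vae.encode(images).latent_dist.sample()


@torch.no_grad()
def decode_latents(latents: torch.Tensor) -> torch.Tensor:
    """Map latents back to mel-spectrogram images."""
    return vae.decode(latents).sample
```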